diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -509,9 +509,7 @@ MachineBasicBlock::iterator MI, const DebugLoc &DL, MCRegister DestReg, MCRegister SrcReg, bool KillSrc, - RegScavenger &RS, - Register ImpDefSuperReg = Register(), - Register ImpUseSuperReg = Register()) { + RegScavenger &RS) { const SIRegisterInfo &RI = TII.getRegisterInfo(); assert(AMDGPU::SReg_32RegClass.contains(SrcReg) || @@ -542,17 +540,8 @@ DefOp.setIsKill(false); } - MachineInstrBuilder Builder = - BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) + BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) .add(DefOp); - if (ImpDefSuperReg) - Builder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); - - if (ImpUseSuperReg) { - Builder.addReg(ImpUseSuperReg, - getKillRegState(KillSrc) | RegState::Implicit); - } - return; } @@ -593,19 +582,11 @@ assert(AMDGPU::SReg_32RegClass.contains(SrcReg)); } - MachineInstrBuilder UseBuilder = BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp) + BuildMI(MBB, MI, DL, TII.get(TmpCopyOp), Tmp) .addReg(SrcReg, getKillRegState(KillSrc)); - if (ImpUseSuperReg) { - UseBuilder.addReg(ImpUseSuperReg, - getKillRegState(KillSrc) | RegState::Implicit); - } - MachineInstrBuilder DefBuilder - = BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) + BuildMI(MBB, MI, DL, TII.get(AMDGPU::V_ACCVGPR_WRITE_B32_e64), DestReg) .addReg(Tmp, RegState::Kill); - - if (ImpDefSuperReg) - DefBuilder.addReg(ImpDefSuperReg, RegState::Define | RegState::Implicit); } static void expandSGPRCopy(const SIInstrInfo &TII, MachineBasicBlock &MBB, @@ -615,7 +596,6 @@ const SIRegisterInfo &RI = TII.getRegisterInfo(); ArrayRef BaseIndices = RI.getRegSplitParts(RC, 4); MachineBasicBlock::iterator I = MI; - MachineInstr *FirstMI = nullptr, *LastMI = nullptr; for (unsigned Idx = 0; Idx < BaseIndices.size(); ++Idx) { int16_t SubIdx = BaseIndices[Idx]; @@ -634,26 +614,12 @@ Idx++; } - LastMI = BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx)) - .addReg(RI.getSubReg(SrcReg, SubIdx)) - .addReg(SrcReg, RegState::Implicit); - - if (!FirstMI) - FirstMI = LastMI; + BuildMI(MBB, I, DL, TII.get(Opcode), RI.getSubReg(DestReg, SubIdx)) + .addReg(RI.getSubReg(SrcReg, SubIdx), getKillRegState(KillSrc)); if (!Forward) I--; } - - assert(FirstMI && LastMI); - if (!Forward) - std::swap(FirstMI, LastMI); - - FirstMI->addOperand( - MachineOperand::CreateReg(DestReg, true /*IsDef*/, true /*IsImp*/)); - - if (KillSrc) - LastMI->addRegisterKilled(SrcReg, &RI); } void SIInstrInfo::copyPhysReg(MachineBasicBlock &MBB, @@ -873,15 +839,14 @@ if (ST.hasPackedFP32Ops()) { BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DestReg) .addImm(SISrcMods::OP_SEL_1) - .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)) .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) - .addReg(SrcReg) + .addReg(SrcReg, getKillRegState(KillSrc)) .addImm(0) // op_sel_lo .addImm(0) // op_sel_hi .addImm(0) // neg_lo .addImm(0) // neg_hi - .addImm(0) // clamp - .addReg(SrcReg, getKillRegState(KillSrc) | RegState::Implicit); + .addImm(0); // clamp return; } } @@ -928,6 +893,7 @@ // instruction, since it will also kill the components made live by this def. const bool CanKillSuperReg = KillSrc && !RI.regsOverlap(SrcReg, DestReg); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); for (unsigned Idx = 0; Idx < SubIndices.size(); ++Idx) { unsigned SubIdx; if (Forward) @@ -937,37 +903,29 @@ bool UseKill = CanKillSuperReg && Idx == SubIndices.size() - 1; + Register DstSubReg = RI.getSubReg(DestReg, SubIdx); + Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); + + if (LiveRegs.available(MRI, SrcSubReg)) + continue; + if (Opcode == AMDGPU::INSTRUCTION_LIST_END) { - Register ImpDefSuper = Idx == 0 ? Register(DestReg) : Register(); - Register ImpUseSuper = SrcReg; - indirectCopyToAGPR(*this, MBB, MI, DL, RI.getSubReg(DestReg, SubIdx), - RI.getSubReg(SrcReg, SubIdx), UseKill, *RS, - ImpDefSuper, ImpUseSuper); + indirectCopyToAGPR(*this, MBB, MI, DL, DstSubReg, + SrcSubReg, UseKill, *RS); } else if (Opcode == AMDGPU::V_PK_MOV_B32) { - Register DstSubReg = RI.getSubReg(DestReg, SubIdx); - Register SrcSubReg = RI.getSubReg(SrcReg, SubIdx); - MachineInstrBuilder MIB = - BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg) + BuildMI(MBB, MI, DL, get(AMDGPU::V_PK_MOV_B32), DstSubReg) .addImm(SISrcMods::OP_SEL_1) - .addReg(SrcSubReg) + .addReg(SrcSubReg, getKillRegState(UseKill)) .addImm(SISrcMods::OP_SEL_0 | SISrcMods::OP_SEL_1) - .addReg(SrcSubReg) + .addReg(SrcSubReg, getKillRegState(UseKill)) .addImm(0) // op_sel_lo .addImm(0) // op_sel_hi .addImm(0) // neg_lo .addImm(0) // neg_hi - .addImm(0) // clamp - .addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); - if (Idx == 0) - MIB.addReg(DestReg, RegState::Define | RegState::Implicit); + .addImm(0); // clamp } else { - MachineInstrBuilder Builder = - BuildMI(MBB, MI, DL, get(Opcode), RI.getSubReg(DestReg, SubIdx)) - .addReg(RI.getSubReg(SrcReg, SubIdx)); - if (Idx == 0) - Builder.addReg(DestReg, RegState::Define | RegState::Implicit); - - Builder.addReg(SrcReg, getKillRegState(UseKill) | RegState::Implicit); + BuildMI(MBB, MI, DL, get(Opcode), DstSubReg) + .addReg(SrcSubReg, getKillRegState(UseKill)); } } } @@ -1749,11 +1707,9 @@ .addImm(0); // clamp } else { BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) - .addImm(Lo.getSExtValue()) - .addReg(Dst, RegState::Implicit | RegState::Define); + .addImm(Lo.getSExtValue()); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) - .addImm(Hi.getSExtValue()) - .addReg(Dst, RegState::Implicit | RegState::Define); + .addImm(Hi.getSExtValue()); } } else { assert(SrcOp.isReg()); @@ -1771,11 +1727,9 @@ .addImm(0); // clamp } else { BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstLo) - .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)) - .addReg(Dst, RegState::Implicit | RegState::Define); + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub0)); BuildMI(MBB, MI, DL, get(AMDGPU::V_MOV_B32_e32), DstHi) - .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)) - .addReg(Dst, RegState::Implicit | RegState::Define); + .addReg(RI.getSubReg(SrcOp.getReg(), AMDGPU::sub1)); } } MI.eraseFromParent(); @@ -1801,11 +1755,9 @@ APInt Lo(32, Imm.getLoBits(32).getZExtValue()); APInt Hi(32, Imm.getHiBits(32).getZExtValue()); BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstLo) - .addImm(Lo.getSExtValue()) - .addReg(Dst, RegState::Implicit | RegState::Define); + .addImm(Lo.getSExtValue()); BuildMI(MBB, MI, DL, get(AMDGPU::S_MOV_B32), DstHi) - .addImm(Hi.getSExtValue()) - .addReg(Dst, RegState::Implicit | RegState::Define); + .addImm(Hi.getSExtValue()); MI.eraseFromParent(); break; } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/atomic_optimizations_mul_one.ll @@ -89,8 +89,8 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_add_i32_e32 v4, vcc, s4, v0 -; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -183,8 +183,8 @@ ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_sub_i32_e32 v4, vcc, s4, v0 -; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 @@ -283,8 +283,8 @@ ; GCN-NEXT: v_readfirstlane_b32 s4, v1 ; GCN-NEXT: v_and_b32_e32 v0, 1, v0 ; GCN-NEXT: v_xor_b32_e32 v4, s4, v0 -; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 +; GCN-NEXT: s_waitcnt expcnt(0) ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/cvt_f32_ubyte.ll @@ -472,15 +472,15 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -656,11 +656,11 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u32_e32 v0, vcc, 2, v0 ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -703,10 +703,10 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -744,15 +744,15 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: v_ashrrev_i32_e32 v3, 31, v0 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: v_mov_b32_e32 v1, s0 +; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v1, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, v2, v3, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -861,10 +861,10 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -906,10 +906,10 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_1 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -952,10 +952,10 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte0_sdwa v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_2 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -997,10 +997,10 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_ubyte3_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement-stack-lower.ll @@ -456,9 +456,9 @@ ; GCN-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:572 ; 4-byte Folded Reload ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v4, v7 +; GCN-NEXT: v_mov_b32_e32 v7, v10 ; GCN-NEXT: v_mov_b32_e32 v5, v8 ; GCN-NEXT: v_mov_b32_e32 v6, v9 -; GCN-NEXT: v_mov_b32_e32 v7, v10 ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:464 ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:468 ; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:472 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -66,8 +66,8 @@ ; GFX8-NEXT: s_lshl_b32 s0, s2, 1 ; GFX8-NEXT: s_lshl_b32 m0, s0, 1 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: v_movrels_b32_e32 v1, v3 ; GFX8-NEXT: v_movrels_b32_e32 v0, v2 +; GFX8-NEXT: v_movrels_b32_e32 v1, v3 ; GFX8-NEXT: v_mov_b32_e32 v3, v1 ; GFX8-NEXT: v_mov_b32_e32 v2, v0 ; GFX8-NEXT: v_readfirstlane_b32 s0, v0 @@ -88,8 +88,8 @@ ; GFX7-NEXT: s_lshl_b32 s0, s2, 1 ; GFX7-NEXT: s_lshl_b32 m0, s0, 1 ; GFX7-NEXT: s_waitcnt vmcnt(0) -; GFX7-NEXT: v_movrels_b32_e32 v1, v3 ; GFX7-NEXT: v_movrels_b32_e32 v0, v2 +; GFX7-NEXT: v_movrels_b32_e32 v1, v3 ; GFX7-NEXT: v_mov_b32_e32 v3, v1 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: v_readfirstlane_b32 s0, v0 @@ -108,14 +108,14 @@ ; GFX10-NEXT: s_lshl_b32 s0, s2, 1 ; GFX10-NEXT: s_lshl_b32 m0, s0, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_movrels_b32_e32 v1, v3 ; GFX10-NEXT: v_movrels_b32_e32 v0, v2 -; GFX10-NEXT: v_mov_b32_e32 v3, v1 +; GFX10-NEXT: v_movrels_b32_e32 v1, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: v_mov_b32_e32 v3, v1 ; GFX10-NEXT: v_readfirstlane_b32 s0, v0 ; GFX10-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 ; GFX10-NEXT: ; return to shader part epilog %vector = load <4 x i128>, <4 x i128> addrspace(1)* %ptr %element = extractelement <4 x i128> %vector, i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.ll @@ -2751,9 +2751,9 @@ ; GPRIDX-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; GPRIDX-NEXT: s_load_dword s8, s[4:5], 0x8 ; GPRIDX-NEXT: s_mov_b32 s0, 0 -; GPRIDX-NEXT: s_mov_b32 s1, 0x40140000 ; GPRIDX-NEXT: s_mov_b32 s3, 0x40080000 ; GPRIDX-NEXT: s_mov_b32 s2, s0 +; GPRIDX-NEXT: s_mov_b32 s1, 0x40140000 ; GPRIDX-NEXT: s_waitcnt lgkmcnt(0) ; GPRIDX-NEXT: s_cmp_eq_u32 s8, 1 ; GPRIDX-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 @@ -2842,9 +2842,9 @@ ; MOVREL-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; MOVREL-NEXT: s_load_dword s8, s[4:5], 0x8 ; MOVREL-NEXT: s_mov_b32 s0, 0 -; MOVREL-NEXT: s_mov_b32 s1, 0x40140000 ; MOVREL-NEXT: s_mov_b32 s3, 0x40080000 ; MOVREL-NEXT: s_mov_b32 s2, s0 +; MOVREL-NEXT: s_mov_b32 s1, 0x40140000 ; MOVREL-NEXT: s_waitcnt lgkmcnt(0) ; MOVREL-NEXT: s_cmp_eq_u32 s8, 1 ; MOVREL-NEXT: s_cselect_b64 s[4:5], 2.0, 1.0 @@ -2855,8 +2855,8 @@ ; MOVREL-NEXT: s_cmp_eq_u32 s8, 4 ; MOVREL-NEXT: s_cselect_b64 s[0:1], s[0:1], s[2:3] ; MOVREL-NEXT: v_mov_b32_e32 v0, s0 -; MOVREL-NEXT: v_mov_b32_e32 v2, s6 ; MOVREL-NEXT: v_mov_b32_e32 v1, s1 +; MOVREL-NEXT: v_mov_b32_e32 v2, s6 ; MOVREL-NEXT: v_mov_b32_e32 v3, s7 ; MOVREL-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; MOVREL-NEXT: s_endpgm @@ -2935,9 +2935,9 @@ ; GFX10-NEXT: s_load_dword s8, s[4:5], 0x8 ; GFX10-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX10-NEXT: s_mov_b32 s2, 0 -; GFX10-NEXT: s_mov_b32 s3, 0x40140000 ; GFX10-NEXT: s_mov_b32 s5, 0x40080000 ; GFX10-NEXT: s_mov_b32 s4, s2 +; GFX10-NEXT: s_mov_b32 s3, 0x40140000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cmp_eq_u32 s8, 1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/fmed3.ll @@ -51,8 +51,8 @@ ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: flat_load_dword v2, v[4:5] glc ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_mov_b32_e32 v7, s1 ; VI-NEXT: v_mov_b32_e32 v6, s0 +; VI-NEXT: v_mov_b32_e32 v7, s1 ; VI-NEXT: v_add_u32_e32 v6, vcc, v6, v8 ; VI-NEXT: v_addc_u32_e32 v7, vcc, 0, v7, vcc ; VI-NEXT: v_sub_f32_e32 v0, 0x80000000, v0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/frem.ll @@ -835,11 +835,11 @@ ; VI-NEXT: v_fma_f32 v2, -v2, v5, v3 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v2, v2, v4, v5 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_div_fixup_f32 v2, v2, v1, s3 ; VI-NEXT: v_trunc_f32_e32 v2, v2 ; VI-NEXT: v_fma_f32 v1, -v2, v1, s3 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm %gep2 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in2, i32 4 @@ -997,11 +997,11 @@ ; VI-NEXT: v_fma_f32 v4, -v4, v7, v5 ; VI-NEXT: s_setreg_imm32_b32 hwreg(HW_REG_MODE, 4, 2), 0 ; VI-NEXT: v_div_fmas_f32 v4, v4, v6, v7 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_div_fixup_f32 v4, v4, v3, s3 ; VI-NEXT: v_trunc_f32_e32 v4, v4 ; VI-NEXT: v_fma_f32 v3, -v4, v3, s3 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %gep2 = getelementptr <4 x float>, <4 x float> addrspace(1)* %in2, i32 4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -111,10 +111,10 @@ ; GFX8-NEXT: s_lshl_b32 s0, s0, s1 ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: s_lshl_b32 s2, s2, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v2, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -129,10 +129,10 @@ ; GFX7-NEXT: s_lshl_b32 s0, s0, s1 ; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: s_lshl_b32 s2, s2, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v2, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: v_or_b32_e32 v2, s2, v2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -295,8 +295,8 @@ ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s1 ; GFX10-NEXT: s_and_b32 s1, s4, s1 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s1 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_and_or_b32 v2, s0, v3, v2 @@ -444,8 +444,8 @@ ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s0 ; GFX10-NEXT: s_and_b32 s0, s2, s0 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v0, s0 -; GFX10-NEXT: v_xor_b32_e32 v4, -1, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_xor_b32_e32 v4, -1, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_or_b32 v2, v3, v4, v2 @@ -484,10 +484,10 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_not_b32 s0, s0 ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 +; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_and_b32_e32 v3, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, 0 -; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -502,10 +502,10 @@ ; GFX7-NEXT: s_lshl_b32 s0, s0, s1 ; GFX7-NEXT: s_not_b32 s0, s0 ; GFX7-NEXT: v_lshlrev_b32_e32 v2, s1, v1 +; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_and_b32_e32 v3, s0, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, 0 -; GFX7-NEXT: v_mov_b32_e32 v1, 0 ; GFX7-NEXT: v_or_b32_e32 v2, v3, v2 ; GFX7-NEXT: flat_store_dword v[0:1], v2 ; GFX7-NEXT: s_endpgm @@ -591,8 +591,8 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_lshlrev_b32_e64 v1, v0, s0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 -; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_xor_b32_e32 v3, -1, v1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_and_or_b32 v2, v4, v3, v2 @@ -746,12 +746,12 @@ ; GFX10-NEXT: s_lshl_b32 s0, s0, s3 ; GFX10-NEXT: s_lshl_b32 s2, s2, s3 ; GFX10-NEXT: s_not_b32 s0, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v2, v0, v1, vcc_lo ; GFX10-NEXT: v_and_or_b32 v4, v2, s0, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, s1, 0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v4, s0 ; GFX10-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -782,8 +782,8 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -808,8 +808,8 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -834,8 +834,8 @@ ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc -; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: v_cmp_eq_u32_e64 vcc, s2, 1 +; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -849,6 +849,7 @@ ; GFX10-NEXT: s_cmp_eq_u32 s2, 1 ; GFX10-NEXT: v_and_b32_e32 v2, s5, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 0 +; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cselect_b32 s3, s1, s0 ; GFX10-NEXT: s_and_b32 s4, s4, 1 @@ -859,7 +860,6 @@ ; GFX10-NEXT: s_andn2_b32 s3, s3, s5 ; GFX10-NEXT: v_lshl_or_b32 v4, v2, s4, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s2, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo @@ -892,8 +892,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -920,8 +920,8 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -948,8 +948,8 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -969,9 +969,9 @@ ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s1 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 ; GFX10-NEXT: v_and_or_b32 v5, v5, v2, v3 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 @@ -1006,8 +1006,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX9-NEXT: global_store_dwordx2 v[2:3], v[0:1], off @@ -1033,8 +1033,8 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1061,8 +1061,8 @@ ; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v2 -; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v4, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc ; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1081,8 +1081,8 @@ ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_cndmask_b32_e32 v5, s0, v1, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 0, v4 ; GFX10-NEXT: v_and_or_b32 v5, v5, v3, v2 @@ -1111,8 +1111,8 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX9-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 +; GFX9-NEXT: v_mov_b32_e32 v3, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc @@ -1134,8 +1134,8 @@ ; GFX8-NEXT: v_lshlrev_b32_e64 v2, v2, s0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX8-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 +; GFX8-NEXT: v_mov_b32_e32 v3, 0 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc @@ -1158,8 +1158,8 @@ ; GFX7-NEXT: v_lshl_b32_e32 v2, s0, v2 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v5 ; GFX7-NEXT: v_xor_b32_e32 v2, -1, v2 -; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v5 +; GFX7-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-NEXT: v_mov_b32_e32 v4, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc @@ -1308,8 +1308,8 @@ ; GFX9-NEXT: v_lshlrev_b32_e64 v3, v3, s0 ; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX9-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc @@ -1330,8 +1330,8 @@ ; GFX8-NEXT: v_lshlrev_b32_e64 v3, v3, s0 ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX8-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc @@ -1354,8 +1354,8 @@ ; GFX7-NEXT: v_lshl_b32_e32 v3, s0, v3 ; GFX7-NEXT: v_cmp_eq_u32_e32 vcc, 1, v6 ; GFX7-NEXT: v_xor_b32_e32 v3, -1, v3 -; GFX7-NEXT: v_mov_b32_e32 v4, 0 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[0:1], 0, v6 +; GFX7-NEXT: v_mov_b32_e32 v4, 0 ; GFX7-NEXT: v_mov_b32_e32 v5, 0 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_cndmask_b32_e32 v7, v0, v1, vcc @@ -1422,8 +1422,8 @@ ; GFX9-NEXT: s_cselect_b32 s2, s4, s2 ; GFX9-NEXT: s_cmp_eq_u32 s6, 3 ; GFX9-NEXT: s_cselect_b32 s3, s4, s3 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 @@ -1458,8 +1458,8 @@ ; GFX8-NEXT: s_cselect_b32 s2, s4, s2 ; GFX8-NEXT: s_cmp_eq_u32 s6, 3 ; GFX8-NEXT: s_cselect_b32 s3, s4, s3 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 @@ -1493,8 +1493,8 @@ ; GFX7-NEXT: s_cselect_b32 s2, s4, s2 ; GFX7-NEXT: s_cmp_eq_u32 s6, 3 ; GFX7-NEXT: s_cselect_b32 s3, s4, s3 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 @@ -1527,14 +1527,14 @@ ; GFX10-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10-NEXT: s_cselect_b32 s0, s4, s0 ; GFX10-NEXT: s_cmp_eq_u32 s6, 1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_cselect_b32 s1, s4, s1 ; GFX10-NEXT: s_cmp_eq_u32 s6, 2 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_cselect_b32 s2, s4, s2 ; GFX10-NEXT: s_cmp_eq_u32 s6, 3 -; GFX10-NEXT: s_cselect_b32 s3, s4, s3 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: s_cselect_b32 s3, s4, s3 ; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX10-NEXT: s_endpgm @@ -1647,6 +1647,7 @@ ; GFX10-NEXT: s_lshl_b32 s3, s1, 4 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, s4, 3 ; GFX10-NEXT: s_mov_b32 s5, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: s_and_b32 s2, s2, s5 ; GFX10-NEXT: s_lshl_b32 s5, s5, s3 ; GFX10-NEXT: s_lshl_b32 s2, s2, s3 @@ -1658,7 +1659,6 @@ ; GFX10-NEXT: v_and_or_b32 v6, v4, s3, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, s4, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v6, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s0 @@ -1701,8 +1701,8 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off @@ -1737,8 +1737,8 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc -; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 vcc, s5, 3 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v6, vcc ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1789,6 +1789,7 @@ ; GFX10-NEXT: s_cmp_eq_u32 s5, 1 ; GFX10-NEXT: v_and_b32_e32 v4, s7, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cselect_b32 s6, s1, s0 ; GFX10-NEXT: s_cmp_eq_u32 s5, 2 @@ -1805,7 +1806,6 @@ ; GFX10-NEXT: s_andn2_b32 s6, s6, s7 ; GFX10-NEXT: v_lshl_or_b32 v6, v4, s4, s6 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 -; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v6, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s5, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc_lo @@ -1850,10 +1850,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s10 ; GFX9-NEXT: v_mov_b32_e32 v3, s11 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off @@ -1888,10 +1888,10 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s10 ; GFX8-NEXT: v_mov_b32_e32 v3, s11 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1953,13 +1953,13 @@ ; GFX10-NEXT: v_xor_b32_e32 v5, -1, v2 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v0, s9 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: v_mov_b32_e32 v2, s10 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_cndmask_b32_e32 v0, s8, v0, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v0, s11, s1 ; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 @@ -2003,10 +2003,10 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off @@ -2040,10 +2040,10 @@ ; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 -; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v6, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[2:3] ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -2104,13 +2104,13 @@ ; GFX10-NEXT: v_xor_b32_e32 v5, -1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s4, v1, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s6, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v7, v1, s7, s1 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: v_and_or_b32 v7, v7, v5, v4 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 @@ -2141,8 +2141,8 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v7, 0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc @@ -2170,8 +2170,8 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: v_mov_b32_e32 v7, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX8-NEXT: v_mov_b32_e32 v7, 0 ; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v9, v3, v4, vcc @@ -2224,6 +2224,7 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 1, v2 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v2 ; GFX10-NEXT: s_mov_b32 s0, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: s_and_b32 s1, s2, s0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 @@ -2239,7 +2240,6 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v6, s1 ; GFX10-NEXT: v_and_or_b32 v9, v2, v7, v0 ; GFX10-NEXT: v_mov_b32_e32 v7, 0 -; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v9, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v9, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v9, s0 @@ -2390,8 +2390,8 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc @@ -2418,8 +2418,8 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v0 ; GFX8-NEXT: v_xor_b32_e32 v1, -1, v1 -; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v0 +; GFX8-NEXT: v_mov_b32_e32 v8, 0 ; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc @@ -2472,6 +2472,7 @@ ; GFX10-NEXT: v_and_b32_e32 v0, 1, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 1, v3 ; GFX10-NEXT: s_mov_b32 s0, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 4, v0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v1 @@ -2481,7 +2482,6 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v0, v0, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v2, -1, v8 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v6, s0 @@ -2541,14 +2541,14 @@ ; GFX9-NEXT: s_cmp_eq_u32 s7, 5 ; GFX9-NEXT: s_cselect_b32 s5, s16, s13 ; GFX9-NEXT: s_cmp_eq_u32 s7, 6 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_cselect_b32 s6, s16, s14 ; GFX9-NEXT: s_cmp_eq_u32 s7, 7 +; GFX9-NEXT: s_cselect_b32 s7, s16, s15 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s3 -; GFX9-NEXT: s_cselect_b32 s7, s16, s15 ; GFX9-NEXT: global_store_dwordx4 v[4:5], v[0:3], off ; GFX9-NEXT: s_mov_b64 s[0:1], 16 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 @@ -2600,18 +2600,18 @@ ; GFX8-NEXT: s_cmp_eq_u32 s7, 5 ; GFX8-NEXT: s_cselect_b32 s5, s16, s13 ; GFX8-NEXT: s_cmp_eq_u32 s7, 6 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_cselect_b32 s6, s16, s14 ; GFX8-NEXT: s_cmp_eq_u32 s7, 7 +; GFX8-NEXT: s_cselect_b32 s7, s16, s15 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 -; GFX8-NEXT: s_cselect_b32 s7, s16, s15 ; GFX8-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GFX8-NEXT: v_mov_b32_e32 v4, 16 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s7 @@ -2658,16 +2658,16 @@ ; GFX7-NEXT: s_cmp_eq_u32 s7, 5 ; GFX7-NEXT: s_cselect_b32 s5, s16, s13 ; GFX7-NEXT: s_cmp_eq_u32 s7, 6 -; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: s_cselect_b32 s6, s16, s14 ; GFX7-NEXT: s_cmp_eq_u32 s7, 7 +; GFX7-NEXT: s_cselect_b32 s7, s16, s15 ; GFX7-NEXT: s_mov_b64 s[8:9], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_mov_b32 s10, -1 ; GFX7-NEXT: s_mov_b32 s11, 0xf000 -; GFX7-NEXT: s_cselect_b32 s7, s16, s15 ; GFX7-NEXT: buffer_store_dwordx4 v[0:3], off, s[8:11], 0 ; GFX7-NEXT: s_mov_b64 s[8:9], 16 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 @@ -2710,28 +2710,28 @@ ; GFX10-NEXT: s_cmp_eq_u32 s7, 0 ; GFX10-NEXT: s_cselect_b32 s0, s16, s8 ; GFX10-NEXT: s_cmp_eq_u32 s7, 1 +; GFX10-NEXT: v_mov_b32_e32 v0, s0 ; GFX10-NEXT: s_cselect_b32 s1, s16, s9 ; GFX10-NEXT: s_cmp_eq_u32 s7, 2 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_cselect_b32 s2, s16, s10 ; GFX10-NEXT: s_cmp_eq_u32 s7, 3 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: s_cselect_b32 s3, s16, s11 ; GFX10-NEXT: s_cmp_eq_u32 s7, 4 -; GFX10-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: s_cselect_b32 s4, s16, s12 ; GFX10-NEXT: s_cmp_eq_u32 s7, 5 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-NEXT: v_mov_b32_e32 v4, s4 ; GFX10-NEXT: s_cselect_b32 s5, s16, s13 ; GFX10-NEXT: s_cmp_eq_u32 s7, 6 -; GFX10-NEXT: v_mov_b32_e32 v2, s2 +; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: s_cselect_b32 s6, s16, s14 ; GFX10-NEXT: s_cmp_eq_u32 s7, 7 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 -; GFX10-NEXT: s_cselect_b32 s7, s16, s15 -; GFX10-NEXT: v_mov_b32_e32 v4, s4 -; GFX10-NEXT: v_mov_b32_e32 v5, s5 ; GFX10-NEXT: v_mov_b32_e32 v6, s6 -; GFX10-NEXT: v_mov_b32_e32 v7, s7 +; GFX10-NEXT: s_cselect_b32 s7, s16, s15 ; GFX10-NEXT: s_mov_b64 s[0:1], 16 +; GFX10-NEXT: v_mov_b32_e32 v7, s7 ; GFX10-NEXT: global_store_dwordx4 v[8:9], v[0:3], off ; GFX10-NEXT: global_store_dwordx4 v10, v[4:7], s[0:1] ; GFX10-NEXT: s_endpgm @@ -2810,6 +2810,7 @@ ; GFX8-NEXT: v_cmp_eq_u32_e64 s[6:7], s12, 5 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 +; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: s_waitcnt vmcnt(1) @@ -2833,7 +2834,6 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v6, v6, v8, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v7, v8, s[10:11] ; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GFX8-NEXT: s_endpgm @@ -3124,6 +3124,7 @@ ; GFX10-NEXT: s_cmp_eq_u32 s0, 1 ; GFX10-NEXT: v_and_b32_e32 v8, s3, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 0 +; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_cselect_b32 s1, s9, s8 @@ -3153,7 +3154,6 @@ ; GFX10-NEXT: s_andn2_b32 s1, s1, s3 ; GFX10-NEXT: v_lshl_or_b32 v10, v8, s2, s1 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 -; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s0, 1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v10, vcc_lo @@ -3384,22 +3384,22 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v2, s10 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_cndmask_b32_e32 v1, s8, v1, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v4, s12 +; GFX10-NEXT: v_mov_b32_e32 v5, s13 +; GFX10-NEXT: v_mov_b32_e32 v6, s14 +; GFX10-NEXT: v_mov_b32_e32 v7, s15 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s12, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s13, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s14, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v1, s15, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 -; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_and_or_b32 v11, v11, v9, v8 -; GFX10-NEXT: v_mov_b32_e32 v4, s12 -; GFX10-NEXT: v_mov_b32_e32 v5, s13 -; GFX10-NEXT: v_mov_b32_e32 v6, s14 -; GFX10-NEXT: v_mov_b32_e32 v7, s15 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s6 @@ -3580,11 +3580,11 @@ ; GFX7-NEXT: v_mov_b32_e32 v1, s13 ; GFX7-NEXT: v_mov_b32_e32 v2, s14 ; GFX7-NEXT: v_mov_b32_e32 v3, s15 +; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 ; GFX7-NEXT: v_mov_b32_e32 v4, s16 ; GFX7-NEXT: v_mov_b32_e32 v5, s17 ; GFX7-NEXT: v_mov_b32_e32 v6, s18 ; GFX7-NEXT: v_mov_b32_e32 v7, s19 -; GFX7-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v8 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[12:13] ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v9, vcc ; GFX7-NEXT: v_cndmask_b32_e64 v2, v2, v9, s[0:1] @@ -3622,22 +3622,22 @@ ; GFX10-NEXT: v_xor_b32_e32 v9, -1, v3 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 +; GFX10-NEXT: v_mov_b32_e32 v4, s12 ; GFX10-NEXT: v_cndmask_b32_e32 v2, s8, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, s13 +; GFX10-NEXT: v_mov_b32_e32 v6, s14 +; GFX10-NEXT: v_mov_b32_e32 v7, s15 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s10, s0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s11, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s12, s2 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s13, s3 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s14, s4 ; GFX10-NEXT: v_cndmask_b32_e64 v11, v2, s15, s5 -; GFX10-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_and_or_b32 v11, v11, v9, v8 -; GFX10-NEXT: v_mov_b32_e32 v4, s12 -; GFX10-NEXT: v_mov_b32_e32 v5, s13 -; GFX10-NEXT: v_mov_b32_e32 v6, s14 -; GFX10-NEXT: v_mov_b32_e32 v7, s15 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 ; GFX10-NEXT: v_mov_b32_e32 v9, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v11, s6 @@ -3690,12 +3690,12 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] ; GFX9-NEXT: v_and_or_b32 v11, v11, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] @@ -3739,15 +3739,15 @@ ; GFX8-NEXT: v_and_b32_e32 v1, v11, v1 ; GFX8-NEXT: v_or_b32_e32 v11, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] ; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc +; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] -; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -3810,7 +3810,7 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 ; GFX10-NEXT: s_mov_b32 s5, 0xffff -; GFX10-NEXT: v_mov_b32_e32 v14, 0 +; GFX10-NEXT: v_mov_b32_e32 v12, 0 ; GFX10-NEXT: s_and_b32 s6, s2, s5 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 @@ -3819,6 +3819,7 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 s4, 5, v0 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 4, v2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 6, v0 +; GFX10-NEXT: v_mov_b32_e32 v14, 0 ; GFX10-NEXT: v_lshlrev_b32_e64 v11, v2, s5 ; GFX10-NEXT: v_cmp_eq_u32_e64 s5, 7, v0 ; GFX10-NEXT: v_lshlrev_b32_e64 v2, v2, s6 @@ -3835,7 +3836,6 @@ ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v10, s5 ; GFX10-NEXT: v_and_or_b32 v13, v1, v11, v2 ; GFX10-NEXT: v_mov_b32_e32 v11, 0 -; GFX10-NEXT: v_mov_b32_e32 v12, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v3, v13, s6 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v13, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v5, v13, s0 @@ -3885,12 +3885,12 @@ ; GFX9-NEXT: v_and_or_b32 v11, v1, s13, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] @@ -3934,15 +3934,15 @@ ; GFX8-NEXT: v_or_b32_e32 v11, v1, v0 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] ; GFX8-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] ; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc +; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] -; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -4080,11 +4080,11 @@ ; GFX9-NEXT: v_and_or_b32 v12, v3, v1, v2 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] ; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] ; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] @@ -4128,14 +4128,14 @@ ; GFX8-NEXT: v_or_b32_e32 v12, v1, v2 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] ; GFX8-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] ; GFX8-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] ; GFX8-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] ; GFX8-NEXT: v_mov_b32_e32 v8, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] +; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] -; GFX8-NEXT: v_mov_b32_e32 v9, 0 ; GFX8-NEXT: v_mov_b32_e32 v10, 16 ; GFX8-NEXT: v_mov_b32_e32 v11, 0 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -4198,6 +4198,7 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 ; GFX10-NEXT: s_mov_b32 s4, 0xffff +; GFX10-NEXT: v_mov_b32_e32 v13, 0 ; GFX10-NEXT: v_mov_b32_e32 v15, 0 ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 ; GFX10-NEXT: v_cmp_eq_u32_e64 s0, 2, v0 @@ -4212,7 +4213,6 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_0 ; GFX10-NEXT: v_xor_b32_e32 v3, -1, v12 ; GFX10-NEXT: v_mov_b32_e32 v12, 0 -; GFX10-NEXT: v_mov_b32_e32 v13, 0 ; GFX10-NEXT: s_waitcnt vmcnt(1) ; GFX10-NEXT: v_cndmask_b32_e32 v1, v4, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, v6, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i8.ll @@ -76,10 +76,10 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s4, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s4, s0 ; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off @@ -157,10 +157,10 @@ ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: v_lshrrev_b32_e32 v1, 8, v0 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v0, s2, s1 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, s0 ; GFX10-NEXT: s_movk_i32 s0, 0xff ; GFX10-NEXT: v_and_b32_sdwa v3, v1, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off @@ -327,9 +327,9 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v2, 8, v1 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_and_b32_sdwa v2, v2, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_cndmask_b32_e64 v3, v1, s4, vcc_lo -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or_b32_sdwa v2, v3, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off @@ -410,10 +410,10 @@ ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 8, v2 ; GFX10-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_and_b32_sdwa v3, v3, s0 dst_sel:BYTE_1 dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:DWORD ; GFX10-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or_b32_sdwa v2, v2, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX10-NEXT: global_store_short v[0:1], v2, off ; GFX10-NEXT: s_endpgm @@ -1451,10 +1451,10 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v2, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_and_or_b32 v2, v0, s1, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or3_b32 v2, v2, v4, v3 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm @@ -1744,10 +1744,10 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v2, v3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v3, 24, v0 ; GFX10-NEXT: v_lshlrev_b32_sdwa v4, v1, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_and_or_b32 v2, 0xff, v0, v2 ; GFX10-NEXT: v_lshlrev_b32_e32 v3, 24, v3 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or3_b32 v2, v2, v4, v3 ; GFX10-NEXT: global_store_dword v[0:1], v2, off ; GFX10-NEXT: s_endpgm @@ -1955,8 +1955,8 @@ ; GFX7-NEXT: s_or_b32 s3, s4, s3 ; GFX7-NEXT: s_lshl_b32 s4, s5, 24 ; GFX7-NEXT: s_or_b32 s3, s3, s4 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_mov_b32 s2, -1 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 @@ -2008,26 +2008,26 @@ ; GFX10-NEXT: s_cselect_b32 s1, s4, s1 ; GFX10-NEXT: s_bfe_u32 s7, s0, s3 ; GFX10-NEXT: s_bfe_u32 s3, s1, s3 +; GFX10-NEXT: s_lshr_b32 s4, s0, 24 ; GFX10-NEXT: s_and_b32 s5, s0, s2 +; GFX10-NEXT: s_bfe_u32 s0, s0, s6 ; GFX10-NEXT: s_lshr_b32 s8, s1, 24 +; GFX10-NEXT: s_lshl_b32 s7, s7, 8 ; GFX10-NEXT: s_and_b32 s2, s1, s2 ; GFX10-NEXT: s_lshl_b32 s3, s3, 8 ; GFX10-NEXT: s_bfe_u32 s1, s1, s6 -; GFX10-NEXT: s_lshr_b32 s4, s0, 24 -; GFX10-NEXT: s_bfe_u32 s0, s0, s6 -; GFX10-NEXT: s_lshl_b32 s7, s7, 8 -; GFX10-NEXT: s_or_b32 s2, s2, s3 -; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: s_lshl_b32 s0, s0, 16 ; GFX10-NEXT: s_or_b32 s5, s5, s7 -; GFX10-NEXT: s_or_b32 s1, s2, s1 -; GFX10-NEXT: s_lshl_b32 s2, s8, 24 +; GFX10-NEXT: s_or_b32 s2, s2, s3 +; GFX10-NEXT: s_lshl_b32 s1, s1, 16 ; GFX10-NEXT: s_or_b32 s0, s5, s0 ; GFX10-NEXT: s_lshl_b32 s3, s4, 24 -; GFX10-NEXT: s_or_b32 s1, s1, s2 +; GFX10-NEXT: s_or_b32 s1, s2, s1 +; GFX10-NEXT: s_lshl_b32 s2, s8, 24 ; GFX10-NEXT: s_or_b32 s0, s0, s3 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 +; GFX10-NEXT: s_or_b32 s1, s1, s2 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: global_store_dwordx2 v[0:1], v[2:3], off ; GFX10-NEXT: s_endpgm %vec = load <8 x i8>, <8 x i8> addrspace(4)* %ptr @@ -2515,9 +2515,9 @@ ; GFX10-NEXT: v_lshlrev_b32_sdwa v5, s0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v7, s1, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX10-NEXT: v_lshlrev_b32_e32 v2, 24, v2 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX10-NEXT: v_and_or_b32 v5, v1, s2, v5 -; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_or3_b32 v2, v3, v6, v2 ; GFX10-NEXT: v_or3_b32 v3, v5, v7, v4 @@ -4096,8 +4096,8 @@ ; GFX7-NEXT: s_or_b32 s2, s2, s3 ; GFX7-NEXT: s_lshl_b32 s3, s9, 24 ; GFX7-NEXT: s_or_b32 s7, s2, s3 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: v_mov_b32_e32 v2, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s7 @@ -4799,18 +4799,18 @@ ; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v6 @@ -4991,22 +4991,22 @@ ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 0 ; GFX10-NEXT: s_cselect_b32 s6, s1, s0 ; GFX10-NEXT: s_cmp_eq_u32 s7, 2 +; GFX10-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-NEXT: s_cselect_b32 s6, s2, s6 ; GFX10-NEXT: s_cmp_eq_u32 s7, 3 +; GFX10-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-NEXT: s_cselect_b32 s6, s3, s6 ; GFX10-NEXT: s_and_b32 s4, s4, 3 +; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: s_lshl_b32 s4, s4, 3 +; GFX10-NEXT: s_mov_b32 s1, 16 ; GFX10-NEXT: s_lshl_b32 s8, s5, s4 ; GFX10-NEXT: s_andn2_b32 s6, s6, s8 ; GFX10-NEXT: v_lshl_or_b32 v4, v0, s4, s6 ; GFX10-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-NEXT: v_mov_b32_e32 v1, s1 -; GFX10-NEXT: v_mov_b32_e32 v2, s2 -; GFX10-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-NEXT: s_mov_b32 s0, 8 ; GFX10-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 1 -; GFX10-NEXT: s_mov_b32 s1, 16 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s0, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v4, vcc_lo ; GFX10-NEXT: v_cmp_eq_u32_e64 vcc_lo, s7, 2 @@ -5034,8 +5034,8 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_or3_b32 v0, v6, v0, v4 ; GFX10-NEXT: v_or3_b32 v1, v9, v1, v5 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_or3_b32 v2, v11, v2, v7 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_or3_b32 v3, v10, v3, v8 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off @@ -5248,18 +5248,18 @@ ; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v6 @@ -5449,6 +5449,7 @@ ; GFX10-NEXT: s_or_b32 s11, s1, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 ; GFX10-NEXT: s_and_b32 s2, s4, s5 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 ; GFX10-NEXT: v_lshlrev_b32_e64 v0, v0, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s11, s1 @@ -5456,18 +5457,17 @@ ; GFX10-NEXT: v_mov_b32_e32 v0, s8 ; GFX10-NEXT: v_mov_b32_e32 v1, s9 ; GFX10-NEXT: v_mov_b32_e32 v2, s10 -; GFX10-NEXT: v_mov_b32_e32 v3, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 ; GFX10-NEXT: s_mov_b32 s2, 16 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v6, v0, s5, v6 @@ -5484,8 +5484,8 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_or3_b32 v0, v6, v0, v4 ; GFX10-NEXT: v_or3_b32 v1, v9, v1, v5 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_or3_b32 v2, v11, v2, v7 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_or3_b32 v3, v10, v3, v8 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off @@ -5563,14 +5563,14 @@ ; GFX9-NEXT: v_and_or_b32 v5, v2, v1, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 -; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GFX9-NEXT: s_mov_b32 s8, 8 ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] ; GFX9-NEXT: s_mov_b32 s9, 16 ; GFX9-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v8, s8, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX9-NEXT: v_and_or_b32 v8, v0, s10, v8 ; GFX9-NEXT: v_lshlrev_b32_sdwa v0, s9, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -5670,8 +5670,6 @@ ; GFX8-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 -; GFX8-NEXT: v_mov_b32_e32 v2, s6 -; GFX8-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] ; GFX8-NEXT: v_mov_b32_e32 v8, 8 @@ -5683,6 +5681,8 @@ ; GFX8-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX8-NEXT: v_or_b32_e32 v0, v9, v0 ; GFX8-NEXT: v_lshlrev_b32_e32 v4, 24, v4 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 ; GFX8-NEXT: v_or_b32_e32 v0, v0, v4 ; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v8, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v5, s[0:1] @@ -5696,18 +5696,18 @@ ; GFX8-NEXT: v_mov_b32_e32 v4, 8 ; GFX8-NEXT: v_lshlrev_b32_sdwa v5, v4, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_mov_b32_e32 v8, 16 +; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX8-NEXT: v_lshrrev_b32_e32 v6, 24, v2 +; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3 ; GFX8-NEXT: v_or_b32_sdwa v5, v2, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v2, v8, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_lshlrev_b32_sdwa v4, v4, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX8-NEXT: v_lshrrev_b32_e32 v7, 24, v3 -; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 -; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX8-NEXT: v_or_b32_sdwa v4, v3, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_lshlrev_b32_sdwa v3, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 -; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 +; GFX8-NEXT: v_or_b32_e32 v2, v5, v2 +; GFX8-NEXT: v_lshlrev_b32_e32 v5, 24, v6 ; GFX8-NEXT: v_or_b32_e32 v3, v4, v3 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v7 +; GFX8-NEXT: v_or_b32_e32 v2, v2, v5 ; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v6 @@ -5782,8 +5782,6 @@ ; GFX7-NEXT: v_or_b32_e32 v5, v1, v0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-NEXT: v_mov_b32_e32 v2, s6 -; GFX7-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v4 ; GFX7-NEXT: v_cndmask_b32_e64 v0, v0, v5, s[4:5] ; GFX7-NEXT: v_bfe_u32 v9, v0, 8, 8 @@ -5794,6 +5792,8 @@ ; GFX7-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc ; GFX7-NEXT: v_or_b32_e32 v8, v8, v9 ; GFX7-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 ; GFX7-NEXT: v_or_b32_e32 v0, v8, v0 ; GFX7-NEXT: v_lshlrev_b32_e32 v4, 24, v4 ; GFX7-NEXT: v_bfe_u32 v8, v1, 8, 8 @@ -5897,24 +5897,24 @@ ; GFX10-NEXT: s_or_b32 s7, s1, s2 ; GFX10-NEXT: v_cmp_eq_u32_e64 s1, 3, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s6, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: v_cmp_eq_u32_e64 s2, 0, v4 ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s7, s1 ; GFX10-NEXT: v_and_or_b32 v5, v2, v1, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s6 -; GFX10-NEXT: v_mov_b32_e32 v3, s7 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 ; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, v5, s2 ; GFX10-NEXT: v_cndmask_b32_e32 v1, v1, v5, vcc_lo ; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, v5, s0 -; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, v5, s1 ; GFX10-NEXT: s_mov_b32 s2, 16 +; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 ; GFX10-NEXT: v_lshrrev_b32_e32 v4, 24, v0 ; GFX10-NEXT: v_lshrrev_b32_e32 v5, 24, v1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v6, s3, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshrrev_b32_e32 v7, 24, v2 -; GFX10-NEXT: v_lshrrev_b32_e32 v8, 24, v3 +; GFX10-NEXT: v_lshlrev_b32_sdwa v9, s3, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v11, v10, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_lshlrev_b32_sdwa v10, v10, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX10-NEXT: v_and_or_b32 v6, v0, s8, v6 @@ -5931,8 +5931,8 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v8, 24, v8 ; GFX10-NEXT: v_or3_b32 v0, v6, v0, v4 ; GFX10-NEXT: v_or3_b32 v1, v9, v1, v5 -; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_or3_b32 v2, v11, v2, v7 +; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0 ; GFX10-NEXT: v_or3_b32 v3, v10, v3, v8 ; GFX10-NEXT: global_store_dwordx4 v[4:5], v[0:3], off @@ -6742,7 +6742,6 @@ ; GFX9-NEXT: v_lshlrev_b32_sdwa v15, v1, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v17, v1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 ; GFX9-NEXT: v_lshlrev_b32_sdwa v1, v1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_1 -; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_lshlrev_b32_sdwa v14, v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v16, v8, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 ; GFX9-NEXT: v_lshlrev_b32_sdwa v18, v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_2 @@ -6755,6 +6754,7 @@ ; GFX9-NEXT: v_lshlrev_b32_e32 v11, 24, v11 ; GFX9-NEXT: v_and_or_b32 v13, v2, v0, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v12, 24, v12 +; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_or3_b32 v0, v3, v14, v7 ; GFX9-NEXT: v_or3_b32 v1, v9, v16, v10 @@ -6835,7 +6835,6 @@ ; GFX8-NEXT: v_or_b32_sdwa v1, v1, v14 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v3, v3, v16 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: v_or_b32_sdwa v0, v0, v9 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD -; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_lshlrev_b32_e32 v6, 24, v6 ; GFX8-NEXT: v_lshlrev_b32_e32 v7, 24, v7 ; GFX8-NEXT: v_lshlrev_b32_e32 v8, 24, v8 @@ -6844,6 +6843,7 @@ ; GFX8-NEXT: v_or_b32_e32 v1, v1, v15 ; GFX8-NEXT: v_or_b32_e32 v3, v3, v17 ; GFX8-NEXT: v_or_b32_e32 v10, v0, v10 +; GFX8-NEXT: v_mov_b32_e32 v4, 0 ; GFX8-NEXT: v_mov_b32_e32 v5, 0 ; GFX8-NEXT: v_or_b32_e32 v0, v2, v6 ; GFX8-NEXT: v_or_b32_e32 v1, v1, v7 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.ll @@ -100,35 +100,35 @@ ; GPRIDX-LABEL: dyn_insertelement_v8f32_const_s_v_v: ; GPRIDX: ; %bb.0: ; %entry ; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GPRIDX-NEXT: s_mov_b32 s11, 0x41000000 -; GPRIDX-NEXT: s_mov_b32 s10, 0x40e00000 -; GPRIDX-NEXT: s_mov_b32 s9, 0x40c00000 -; GPRIDX-NEXT: s_mov_b32 s8, 0x40a00000 -; GPRIDX-NEXT: s_mov_b32 s7, 4.0 -; GPRIDX-NEXT: s_mov_b32 s6, 0x40400000 -; GPRIDX-NEXT: s_mov_b32 s5, 2.0 ; GPRIDX-NEXT: s_mov_b32 s4, 1.0 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s11 +; GPRIDX-NEXT: s_mov_b32 s5, 2.0 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s4 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GPRIDX-NEXT: s_mov_b32 s6, 0x40400000 ; GPRIDX-NEXT: v_mov_b32_e32 v9, s5 ; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GPRIDX-NEXT: s_mov_b32 s7, 4.0 ; GPRIDX-NEXT: v_mov_b32_e32 v10, s6 ; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1 +; GPRIDX-NEXT: s_mov_b32 s8, 0x40a00000 ; GPRIDX-NEXT: v_mov_b32_e32 v11, s7 ; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 +; GPRIDX-NEXT: s_mov_b32 s9, 0x40c00000 ; GPRIDX-NEXT: v_mov_b32_e32 v12, s8 ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v1 +; GPRIDX-NEXT: s_mov_b32 s10, 0x40e00000 ; GPRIDX-NEXT: v_mov_b32_e32 v13, s9 ; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1 +; GPRIDX-NEXT: s_mov_b32 s11, 0x41000000 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s10 ; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s11 ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v1 ; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v15, v0, vcc @@ -140,29 +140,29 @@ ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 -; MOVREL-NEXT: s_mov_b32 s11, 0x41000000 ; MOVREL-NEXT: s_mov_b32 s4, 1.0 -; MOVREL-NEXT: s_mov_b32 s10, 0x40e00000 -; MOVREL-NEXT: s_mov_b32 s9, 0x40c00000 -; MOVREL-NEXT: s_mov_b32 s8, 0x40a00000 -; MOVREL-NEXT: s_mov_b32 s7, 4.0 -; MOVREL-NEXT: s_mov_b32 s6, 0x40400000 -; MOVREL-NEXT: s_mov_b32 s5, 2.0 -; MOVREL-NEXT: v_mov_b32_e32 v15, s11 -; MOVREL-NEXT: v_mov_b32_e32 v8, s4 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; MOVREL-NEXT: v_mov_b32_e32 v8, s4 +; MOVREL-NEXT: s_mov_b32 s5, 2.0 +; MOVREL-NEXT: s_mov_b32 s6, 0x40400000 ; MOVREL-NEXT: v_mov_b32_e32 v9, s5 ; MOVREL-NEXT: v_mov_b32_e32 v10, s6 -; MOVREL-NEXT: v_mov_b32_e32 v11, s7 -; MOVREL-NEXT: v_mov_b32_e32 v12, s8 ; MOVREL-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; MOVREL-NEXT: v_mov_b32_e32 v13, s9 -; MOVREL-NEXT: v_mov_b32_e32 v14, s10 +; MOVREL-NEXT: s_mov_b32 s7, 4.0 +; MOVREL-NEXT: s_mov_b32 s8, 0x40a00000 +; MOVREL-NEXT: v_mov_b32_e32 v11, s7 +; MOVREL-NEXT: v_mov_b32_e32 v12, s8 ; MOVREL-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 +; MOVREL-NEXT: s_mov_b32 s9, 0x40c00000 +; MOVREL-NEXT: s_mov_b32 s10, 0x40e00000 +; MOVREL-NEXT: v_mov_b32_e32 v13, s9 +; MOVREL-NEXT: v_mov_b32_e32 v14, s10 ; MOVREL-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 +; MOVREL-NEXT: s_mov_b32 s11, 0x41000000 +; MOVREL-NEXT: v_mov_b32_e32 v15, s11 ; MOVREL-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1 ; MOVREL-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc_lo @@ -183,36 +183,36 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_s_v(<8 x float> inreg %vec, float inreg %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f32_s_s_v: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s7 +; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s0 ; GPRIDX-NEXT: v_mov_b32_e32 v7, s10 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 +; GPRIDX-NEXT: s_mov_b32 s2, s4 ; GPRIDX-NEXT: v_mov_b32_e32 v9, s1 ; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v7, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v0 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v10, s2 ; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v9, v7, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v0 +; GPRIDX-NEXT: s_mov_b32 s4, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v11, s3 ; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v7, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v0 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: v_mov_b32_e32 v12, s4 ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v7, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v0 +; GPRIDX-NEXT: s_mov_b32 s6, s8 ; GPRIDX-NEXT: v_mov_b32_e32 v13, s5 ; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v7, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v0 +; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s6 ; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v7, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v0 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s7 ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v7, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v0 ; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v15, v7, vcc @@ -221,29 +221,29 @@ ; ; MOVREL-LABEL: dyn_insertelement_v8f32_s_s_v: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: s_mov_b32 s7, s9 ; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_mov_b32 s2, s4 -; MOVREL-NEXT: s_mov_b32 s4, s6 -; MOVREL-NEXT: s_mov_b32 s6, s8 -; MOVREL-NEXT: v_mov_b32_e32 v15, s7 -; MOVREL-NEXT: v_mov_b32_e32 v8, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; MOVREL-NEXT: v_mov_b32_e32 v8, s0 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s2, s4 ; MOVREL-NEXT: v_mov_b32_e32 v9, s1 ; MOVREL-NEXT: v_mov_b32_e32 v10, s2 -; MOVREL-NEXT: v_mov_b32_e32 v11, s3 -; MOVREL-NEXT: v_mov_b32_e32 v12, s4 ; MOVREL-NEXT: v_cndmask_b32_e64 v8, v8, s10, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v0 -; MOVREL-NEXT: v_mov_b32_e32 v13, s5 -; MOVREL-NEXT: v_mov_b32_e32 v14, s6 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: v_mov_b32_e32 v11, s3 +; MOVREL-NEXT: v_mov_b32_e32 v12, s4 ; MOVREL-NEXT: v_cndmask_b32_e64 v1, v9, s10, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v0 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: v_mov_b32_e32 v13, s5 +; MOVREL-NEXT: v_mov_b32_e32 v14, s6 ; MOVREL-NEXT: v_cndmask_b32_e64 v2, v10, s10, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 +; MOVREL-NEXT: s_mov_b32 s7, s9 +; MOVREL-NEXT: v_mov_b32_e32 v15, s7 ; MOVREL-NEXT: v_cndmask_b32_e64 v3, v11, s10, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v0 ; MOVREL-NEXT: v_cndmask_b32_e64 v4, v12, s10, vcc_lo @@ -263,35 +263,35 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_s(<8 x float> inreg %vec, float %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f32_s_v_s: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s7 +; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s0 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 0 +; GPRIDX-NEXT: s_mov_b32 s2, s4 ; GPRIDX-NEXT: v_mov_b32_e32 v9, s1 ; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 1 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v10, s2 ; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v9, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 2 +; GPRIDX-NEXT: s_mov_b32 s4, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v11, s3 ; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 3 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: v_mov_b32_e32 v12, s4 ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 4 +; GPRIDX-NEXT: s_mov_b32 s6, s8 ; GPRIDX-NEXT: v_mov_b32_e32 v13, s5 ; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 5 +; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s6 ; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 6 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s7 ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s10, 7 ; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v15, v0, vcc @@ -300,29 +300,29 @@ ; ; MOVREL-LABEL: dyn_insertelement_v8f32_s_v_s: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: s_mov_b32 s7, s9 ; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_mov_b32 s2, s4 -; MOVREL-NEXT: s_mov_b32 s4, s6 -; MOVREL-NEXT: s_mov_b32 s6, s8 -; MOVREL-NEXT: v_mov_b32_e32 v15, s7 -; MOVREL-NEXT: v_mov_b32_e32 v8, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s10, 0 +; MOVREL-NEXT: v_mov_b32_e32 v8, s0 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s2, s4 ; MOVREL-NEXT: v_mov_b32_e32 v9, s1 ; MOVREL-NEXT: v_mov_b32_e32 v10, s2 -; MOVREL-NEXT: v_mov_b32_e32 v11, s3 -; MOVREL-NEXT: v_mov_b32_e32 v12, s4 ; MOVREL-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s10, 1 -; MOVREL-NEXT: v_mov_b32_e32 v13, s5 -; MOVREL-NEXT: v_mov_b32_e32 v14, s6 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: v_mov_b32_e32 v11, s3 +; MOVREL-NEXT: v_mov_b32_e32 v12, s4 ; MOVREL-NEXT: v_cndmask_b32_e32 v1, v9, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s10, 2 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: v_mov_b32_e32 v13, s5 +; MOVREL-NEXT: v_mov_b32_e32 v14, s6 ; MOVREL-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s10, 3 +; MOVREL-NEXT: s_mov_b32 s7, s9 +; MOVREL-NEXT: v_mov_b32_e32 v15, s7 ; MOVREL-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s10, 4 ; MOVREL-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc_lo @@ -388,35 +388,35 @@ define amdgpu_ps <8 x float> @dyn_insertelement_v8f32_s_v_v(<8 x float> inreg %vec, float %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f32_s_v_v: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s7 +; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s0 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GPRIDX-NEXT: s_mov_b32 s2, s4 ; GPRIDX-NEXT: v_mov_b32_e32 v9, s1 ; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v10, s2 ; GPRIDX-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1 +; GPRIDX-NEXT: s_mov_b32 s4, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v11, s3 ; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: v_mov_b32_e32 v12, s4 ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v1 +; GPRIDX-NEXT: s_mov_b32 s6, s8 ; GPRIDX-NEXT: v_mov_b32_e32 v13, s5 ; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1 +; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s6 ; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s7 ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 7, v1 ; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v15, v0, vcc @@ -426,29 +426,29 @@ ; ; MOVREL-LABEL: dyn_insertelement_v8f32_s_v_v: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: s_mov_b32 s7, s9 ; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_mov_b32 s2, s4 -; MOVREL-NEXT: s_mov_b32 s4, s6 -; MOVREL-NEXT: s_mov_b32 s6, s8 -; MOVREL-NEXT: v_mov_b32_e32 v15, s7 -; MOVREL-NEXT: v_mov_b32_e32 v8, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; MOVREL-NEXT: v_mov_b32_e32 v8, s0 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s2, s4 ; MOVREL-NEXT: v_mov_b32_e32 v9, s1 ; MOVREL-NEXT: v_mov_b32_e32 v10, s2 -; MOVREL-NEXT: v_mov_b32_e32 v11, s3 -; MOVREL-NEXT: v_mov_b32_e32 v12, s4 ; MOVREL-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; MOVREL-NEXT: v_mov_b32_e32 v13, s5 -; MOVREL-NEXT: v_mov_b32_e32 v14, s6 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: v_mov_b32_e32 v11, s3 +; MOVREL-NEXT: v_mov_b32_e32 v12, s4 ; MOVREL-NEXT: v_cndmask_b32_e32 v9, v9, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: v_mov_b32_e32 v13, s5 +; MOVREL-NEXT: v_mov_b32_e32 v14, s6 ; MOVREL-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 +; MOVREL-NEXT: s_mov_b32 s7, s9 +; MOVREL-NEXT: v_mov_b32_e32 v15, s7 ; MOVREL-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v1 ; MOVREL-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc_lo @@ -754,10 +754,11 @@ ; GPRIDX: ; %bb.0: ; %entry ; GPRIDX-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GPRIDX-NEXT: s_mov_b32 s18, 0 -; GPRIDX-NEXT: s_mov_b64 s[4:5], 1.0 -; GPRIDX-NEXT: s_mov_b32 s19, 0x40200000 ; GPRIDX-NEXT: s_mov_b32 s17, 0x401c0000 ; GPRIDX-NEXT: s_mov_b32 s16, s18 +; GPRIDX-NEXT: s_mov_b64 s[6:7], 2.0 +; GPRIDX-NEXT: s_mov_b64 s[4:5], 1.0 +; GPRIDX-NEXT: s_mov_b32 s19, 0x40200000 ; GPRIDX-NEXT: s_mov_b32 s15, 0x40180000 ; GPRIDX-NEXT: s_mov_b32 s14, s18 ; GPRIDX-NEXT: s_mov_b32 s13, 0x40140000 @@ -765,11 +766,14 @@ ; GPRIDX-NEXT: s_mov_b64 s[10:11], 4.0 ; GPRIDX-NEXT: s_mov_b32 s9, 0x40080000 ; GPRIDX-NEXT: s_mov_b32 s8, s18 -; GPRIDX-NEXT: s_mov_b64 s[6:7], 2.0 ; GPRIDX-NEXT: v_mov_b32_e32 v3, s4 ; GPRIDX-NEXT: v_mov_b32_e32 v4, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v5, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v6, s7 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s16 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s17 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v2 ; GPRIDX-NEXT: v_mov_b32_e32 v7, s8 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s9 ; GPRIDX-NEXT: v_mov_b32_e32 v9, s10 @@ -778,12 +782,8 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v12, s13 ; GPRIDX-NEXT: v_mov_b32_e32 v13, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s16 -; GPRIDX-NEXT: v_mov_b32_e32 v16, s17 ; GPRIDX-NEXT: v_mov_b32_e32 v17, s18 ; GPRIDX-NEXT: v_mov_b32_e32 v18, s19 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[16:17], 0, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 2, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[6:7], 3, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[8:9], 4, v2 @@ -821,40 +821,40 @@ ; MOVREL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; MOVREL-NEXT: s_waitcnt_vscnt null, 0x0 ; MOVREL-NEXT: s_mov_b32 s18, 0 +; MOVREL-NEXT: s_mov_b64 s[6:7], 2.0 ; MOVREL-NEXT: s_mov_b64 s[4:5], 1.0 -; MOVREL-NEXT: s_mov_b32 s19, 0x40200000 -; MOVREL-NEXT: s_mov_b32 s17, 0x401c0000 -; MOVREL-NEXT: s_mov_b32 s16, s18 +; MOVREL-NEXT: s_mov_b64 s[10:11], 4.0 +; MOVREL-NEXT: s_mov_b32 s9, 0x40080000 +; MOVREL-NEXT: s_mov_b32 s8, s18 ; MOVREL-NEXT: s_mov_b32 s15, 0x40180000 ; MOVREL-NEXT: s_mov_b32 s14, s18 ; MOVREL-NEXT: s_mov_b32 s13, 0x40140000 ; MOVREL-NEXT: s_mov_b32 s12, s18 -; MOVREL-NEXT: s_mov_b64 s[10:11], 4.0 -; MOVREL-NEXT: s_mov_b32 s9, 0x40080000 -; MOVREL-NEXT: s_mov_b32 s8, s18 -; MOVREL-NEXT: s_mov_b64 s[6:7], 2.0 ; MOVREL-NEXT: v_mov_b32_e32 v3, s4 ; MOVREL-NEXT: v_mov_b32_e32 v4, s5 ; MOVREL-NEXT: v_mov_b32_e32 v5, s6 ; MOVREL-NEXT: v_mov_b32_e32 v6, s7 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 +; MOVREL-NEXT: s_mov_b32 s19, 0x40200000 +; MOVREL-NEXT: s_mov_b32 s17, 0x401c0000 +; MOVREL-NEXT: s_mov_b32 s16, s18 ; MOVREL-NEXT: v_mov_b32_e32 v7, s8 ; MOVREL-NEXT: v_mov_b32_e32 v8, s9 ; MOVREL-NEXT: v_mov_b32_e32 v9, s10 ; MOVREL-NEXT: v_mov_b32_e32 v10, s11 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 3, v2 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s10, 2, v2 ; MOVREL-NEXT: v_mov_b32_e32 v11, s12 ; MOVREL-NEXT: v_mov_b32_e32 v12, s13 ; MOVREL-NEXT: v_mov_b32_e32 v13, s14 ; MOVREL-NEXT: v_mov_b32_e32 v14, s15 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s6, 4, v2 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s7, 5, v2 ; MOVREL-NEXT: v_mov_b32_e32 v15, s16 ; MOVREL-NEXT: v_mov_b32_e32 v16, s17 ; MOVREL-NEXT: v_mov_b32_e32 v17, s18 ; MOVREL-NEXT: v_mov_b32_e32 v18, s19 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 1, v2 -; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 3, v2 -; MOVREL-NEXT: v_cmp_eq_u32_e64 s10, 2, v2 -; MOVREL-NEXT: v_cmp_eq_u32_e64 s6, 4, v2 -; MOVREL-NEXT: v_cmp_eq_u32_e64 s7, 5, v2 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s8, 6, v2 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s9, 7, v2 ; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo @@ -898,24 +898,22 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_s_v(<8 x double> inreg %vec, double inreg %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f64_s_s_v: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 s9, s11 -; GPRIDX-NEXT: s_mov_b32 s11, s13 -; GPRIDX-NEXT: s_mov_b32 s13, s15 -; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 ; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s11, s13 ; GPRIDX-NEXT: s_mov_b32 s12, s14 +; GPRIDX-NEXT: s_mov_b32 s13, s15 ; GPRIDX-NEXT: s_mov_b32 s14, s16 -; GPRIDX-NEXT: v_mov_b32_e32 v16, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s14 +; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s13 ; GPRIDX-NEXT: v_mov_b32_e32 v13, s12 ; GPRIDX-NEXT: v_mov_b32_e32 v12, s11 @@ -940,6 +938,8 @@ ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[10:11], 7, v0 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GPRIDX-NEXT: v_mov_b32_e32 v0, s19 +; GPRIDX-NEXT: v_mov_b32_e32 v16, s15 +; GPRIDX-NEXT: v_mov_b32_e32 v15, s14 ; GPRIDX-NEXT: v_cndmask_b32_e64 v1, v1, v17, s[12:13] ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v3, v17, vcc ; GPRIDX-NEXT: v_cndmask_b32_e64 v2, v2, v0, s[12:13] @@ -968,48 +968,48 @@ ; ; MOVREL-LABEL: dyn_insertelement_v8f64_s_s_v: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: s_mov_b32 s7, s9 -; MOVREL-NEXT: s_mov_b32 s9, s11 -; MOVREL-NEXT: s_mov_b32 s11, s13 -; MOVREL-NEXT: s_mov_b32 s13, s15 -; MOVREL-NEXT: s_mov_b32 s15, s17 ; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 s1, s3 ; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: v_mov_b32_e32 v2, s1 +; MOVREL-NEXT: v_mov_b32_e32 v1, s0 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 +; MOVREL-NEXT: v_mov_b32_e32 v4, s3 +; MOVREL-NEXT: v_mov_b32_e32 v3, s2 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 ; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s5, s7 ; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_mov_b32 s7, s9 ; MOVREL-NEXT: s_mov_b32 s8, s10 +; MOVREL-NEXT: s_mov_b32 s9, s11 ; MOVREL-NEXT: s_mov_b32 s10, s12 +; MOVREL-NEXT: s_mov_b32 s11, s13 ; MOVREL-NEXT: s_mov_b32 s12, s14 +; MOVREL-NEXT: s_mov_b32 s13, s15 ; MOVREL-NEXT: s_mov_b32 s14, s16 -; MOVREL-NEXT: v_mov_b32_e32 v16, s15 -; MOVREL-NEXT: v_mov_b32_e32 v2, s1 -; MOVREL-NEXT: v_mov_b32_e32 v1, s0 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v0 -; MOVREL-NEXT: v_mov_b32_e32 v15, s14 -; MOVREL-NEXT: v_mov_b32_e32 v14, s13 -; MOVREL-NEXT: v_mov_b32_e32 v13, s12 -; MOVREL-NEXT: v_mov_b32_e32 v12, s11 -; MOVREL-NEXT: v_mov_b32_e32 v11, s10 -; MOVREL-NEXT: v_mov_b32_e32 v10, s9 -; MOVREL-NEXT: v_mov_b32_e32 v9, s8 +; MOVREL-NEXT: s_mov_b32 s15, s17 ; MOVREL-NEXT: v_mov_b32_e32 v8, s7 ; MOVREL-NEXT: v_mov_b32_e32 v7, s6 ; MOVREL-NEXT: v_mov_b32_e32 v6, s5 ; MOVREL-NEXT: v_mov_b32_e32 v5, s4 -; MOVREL-NEXT: v_mov_b32_e32 v4, s3 -; MOVREL-NEXT: v_mov_b32_e32 v3, s2 -; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v0 ; MOVREL-NEXT: v_cndmask_b32_e64 v1, v1, s18, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v2, v2, s19, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 2, v0 +; MOVREL-NEXT: v_mov_b32_e32 v12, s11 +; MOVREL-NEXT: v_mov_b32_e32 v11, s10 +; MOVREL-NEXT: v_mov_b32_e32 v10, s9 +; MOVREL-NEXT: v_mov_b32_e32 v9, s8 ; MOVREL-NEXT: v_cndmask_b32_e64 v3, v3, s18, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, s19, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 4, v0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 5, v0 +; MOVREL-NEXT: v_mov_b32_e32 v16, s15 +; MOVREL-NEXT: v_mov_b32_e32 v15, s14 +; MOVREL-NEXT: v_mov_b32_e32 v14, s13 +; MOVREL-NEXT: v_mov_b32_e32 v13, s12 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 6, v0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 7, v0 ; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, s18, s1 @@ -1049,22 +1049,22 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_s(<8 x double> inreg %vec, double %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f64_s_v_s: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 s9, s11 -; GPRIDX-NEXT: s_mov_b32 s11, s13 -; GPRIDX-NEXT: s_mov_b32 s13, s15 -; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 ; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s11, s13 ; GPRIDX-NEXT: s_mov_b32 s12, s14 +; GPRIDX-NEXT: s_mov_b32 s13, s15 ; GPRIDX-NEXT: s_mov_b32 s14, s16 +; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: v_mov_b32_e32 v17, s15 ; GPRIDX-NEXT: v_mov_b32_e32 v16, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v15, s13 @@ -1098,25 +1098,25 @@ ; ; MOVREL-LABEL: dyn_insertelement_v8f64_s_v_s: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: s_mov_b32 s7, s9 -; MOVREL-NEXT: s_mov_b32 s9, s11 -; MOVREL-NEXT: s_mov_b32 s11, s13 -; MOVREL-NEXT: s_mov_b32 s13, s15 -; MOVREL-NEXT: s_mov_b32 s15, s17 ; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s3, s5 ; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s5, s7 ; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_mov_b32 s7, s9 ; MOVREL-NEXT: s_mov_b32 s8, s10 +; MOVREL-NEXT: s_mov_b32 s9, s11 ; MOVREL-NEXT: s_mov_b32 s10, s12 +; MOVREL-NEXT: s_mov_b32 s11, s13 ; MOVREL-NEXT: s_mov_b32 s12, s14 +; MOVREL-NEXT: s_mov_b32 s13, s15 ; MOVREL-NEXT: s_mov_b32 s14, s16 -; MOVREL-NEXT: v_mov_b32_e32 v17, s15 -; MOVREL-NEXT: v_mov_b32_e32 v2, s0 +; MOVREL-NEXT: s_mov_b32 s15, s17 ; MOVREL-NEXT: s_lshl_b32 m0, s18, 1 +; MOVREL-NEXT: v_mov_b32_e32 v17, s15 ; MOVREL-NEXT: v_mov_b32_e32 v16, s14 ; MOVREL-NEXT: v_mov_b32_e32 v15, s13 ; MOVREL-NEXT: v_mov_b32_e32 v14, s12 @@ -1203,26 +1203,32 @@ define amdgpu_ps void @dyn_insertelement_v8f64_s_v_v(<8 x double> inreg %vec, double %val, i32 %idx) { ; GPRIDX-LABEL: dyn_insertelement_v8f64_s_v_v: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 s9, s11 -; GPRIDX-NEXT: s_mov_b32 s11, s13 -; GPRIDX-NEXT: s_mov_b32 s13, s15 -; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 ; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s11, s13 ; GPRIDX-NEXT: s_mov_b32 s12, s14 +; GPRIDX-NEXT: s_mov_b32 s13, s15 ; GPRIDX-NEXT: s_mov_b32 s14, s16 -; GPRIDX-NEXT: v_mov_b32_e32 v18, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 +; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: v_mov_b32_e32 v16, s13 ; GPRIDX-NEXT: v_mov_b32_e32 v15, s12 +; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 +; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 +; GPRIDX-NEXT: v_mov_b32_e32 v4, s1 +; GPRIDX-NEXT: v_mov_b32_e32 v3, s0 +; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 +; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v2 +; GPRIDX-NEXT: v_mov_b32_e32 v18, s15 +; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s11 ; GPRIDX-NEXT: v_mov_b32_e32 v13, s10 ; GPRIDX-NEXT: v_mov_b32_e32 v12, s9 @@ -1231,12 +1237,6 @@ ; GPRIDX-NEXT: v_mov_b32_e32 v9, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v7, s4 -; GPRIDX-NEXT: v_mov_b32_e32 v6, s3 -; GPRIDX-NEXT: v_mov_b32_e32 v5, s2 -; GPRIDX-NEXT: v_mov_b32_e32 v4, s1 -; GPRIDX-NEXT: v_mov_b32_e32 v3, s0 -; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v2 -; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[0:1], 2, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[2:3], 3, v2 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 s[4:5], 4, v2 @@ -1271,44 +1271,44 @@ ; ; MOVREL-LABEL: dyn_insertelement_v8f64_s_v_v: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: s_mov_b32 s7, s9 -; MOVREL-NEXT: s_mov_b32 s9, s11 -; MOVREL-NEXT: s_mov_b32 s11, s13 -; MOVREL-NEXT: s_mov_b32 s13, s15 -; MOVREL-NEXT: s_mov_b32 s15, s17 ; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 s1, s3 ; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s3, s5 ; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s5, s7 ; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_mov_b32 s7, s9 ; MOVREL-NEXT: s_mov_b32 s8, s10 +; MOVREL-NEXT: s_mov_b32 s9, s11 ; MOVREL-NEXT: s_mov_b32 s10, s12 -; MOVREL-NEXT: s_mov_b32 s12, s14 -; MOVREL-NEXT: s_mov_b32 s14, s16 -; MOVREL-NEXT: v_mov_b32_e32 v18, s15 -; MOVREL-NEXT: v_mov_b32_e32 v17, s14 -; MOVREL-NEXT: v_mov_b32_e32 v16, s13 -; MOVREL-NEXT: v_mov_b32_e32 v15, s12 -; MOVREL-NEXT: v_mov_b32_e32 v14, s11 -; MOVREL-NEXT: v_mov_b32_e32 v13, s10 -; MOVREL-NEXT: v_mov_b32_e32 v12, s9 -; MOVREL-NEXT: v_mov_b32_e32 v11, s8 -; MOVREL-NEXT: v_mov_b32_e32 v10, s7 -; MOVREL-NEXT: v_mov_b32_e32 v9, s6 -; MOVREL-NEXT: v_mov_b32_e32 v8, s5 -; MOVREL-NEXT: v_mov_b32_e32 v7, s4 +; MOVREL-NEXT: s_mov_b32 s11, s13 ; MOVREL-NEXT: v_mov_b32_e32 v6, s3 ; MOVREL-NEXT: v_mov_b32_e32 v5, s2 ; MOVREL-NEXT: v_mov_b32_e32 v4, s1 ; MOVREL-NEXT: v_mov_b32_e32 v3, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 +; MOVREL-NEXT: s_mov_b32 s12, s14 +; MOVREL-NEXT: s_mov_b32 s13, s15 +; MOVREL-NEXT: s_mov_b32 s14, s16 +; MOVREL-NEXT: s_mov_b32 s15, s17 +; MOVREL-NEXT: v_mov_b32_e32 v10, s7 +; MOVREL-NEXT: v_mov_b32_e32 v9, s6 +; MOVREL-NEXT: v_mov_b32_e32 v8, s5 +; MOVREL-NEXT: v_mov_b32_e32 v7, s4 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 3, v2 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s6, 2, v2 +; MOVREL-NEXT: v_mov_b32_e32 v14, s11 +; MOVREL-NEXT: v_mov_b32_e32 v13, s10 +; MOVREL-NEXT: v_mov_b32_e32 v12, s9 +; MOVREL-NEXT: v_mov_b32_e32 v11, s8 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s2, 4, v2 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s3, 5, v2 +; MOVREL-NEXT: v_mov_b32_e32 v18, s15 +; MOVREL-NEXT: v_mov_b32_e32 v17, s14 +; MOVREL-NEXT: v_mov_b32_e32 v16, s13 +; MOVREL-NEXT: v_mov_b32_e32 v15, s12 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s4, 6, v2 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s5, 7, v2 ; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo @@ -2646,22 +2646,22 @@ define amdgpu_ps <16 x i32> @dyn_insertelement_v16i32_s_v_s(<16 x i32> inreg %vec, i32 %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v16i32_s_v_s: ; GPRIDX: ; %bb.0: ; %entry -; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: s_mov_b32 s7, s9 -; GPRIDX-NEXT: s_mov_b32 s9, s11 -; GPRIDX-NEXT: s_mov_b32 s11, s13 -; GPRIDX-NEXT: s_mov_b32 s13, s15 -; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: s_mov_b32 s0, s2 +; GPRIDX-NEXT: s_mov_b32 s1, s3 ; GPRIDX-NEXT: s_mov_b32 s2, s4 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: s_mov_b32 s4, s6 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: s_mov_b32 s6, s8 +; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: s_mov_b32 s8, s10 +; GPRIDX-NEXT: s_mov_b32 s9, s11 ; GPRIDX-NEXT: s_mov_b32 s10, s12 +; GPRIDX-NEXT: s_mov_b32 s11, s13 ; GPRIDX-NEXT: s_mov_b32 s12, s14 +; GPRIDX-NEXT: s_mov_b32 s13, s15 ; GPRIDX-NEXT: s_mov_b32 s14, s16 +; GPRIDX-NEXT: s_mov_b32 s15, s17 ; GPRIDX-NEXT: v_mov_b32_e32 v16, s15 ; GPRIDX-NEXT: v_mov_b32_e32 v15, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s13 @@ -2701,25 +2701,25 @@ ; ; MOVREL-LABEL: dyn_insertelement_v16i32_s_v_s: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: s_mov_b32 s7, s9 -; MOVREL-NEXT: s_mov_b32 s9, s11 -; MOVREL-NEXT: s_mov_b32 s11, s13 -; MOVREL-NEXT: s_mov_b32 s13, s15 -; MOVREL-NEXT: s_mov_b32 s15, s17 ; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: v_mov_b32_e32 v1, s0 ; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s3, s5 ; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s5, s7 ; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_mov_b32 s7, s9 ; MOVREL-NEXT: s_mov_b32 s8, s10 +; MOVREL-NEXT: s_mov_b32 s9, s11 ; MOVREL-NEXT: s_mov_b32 s10, s12 +; MOVREL-NEXT: s_mov_b32 s11, s13 ; MOVREL-NEXT: s_mov_b32 s12, s14 +; MOVREL-NEXT: s_mov_b32 s13, s15 ; MOVREL-NEXT: s_mov_b32 s14, s16 -; MOVREL-NEXT: v_mov_b32_e32 v16, s15 -; MOVREL-NEXT: v_mov_b32_e32 v1, s0 +; MOVREL-NEXT: s_mov_b32 s15, s17 ; MOVREL-NEXT: s_mov_b32 m0, s18 +; MOVREL-NEXT: v_mov_b32_e32 v16, s15 ; MOVREL-NEXT: v_mov_b32_e32 v15, s14 ; MOVREL-NEXT: v_mov_b32_e32 v14, s13 ; MOVREL-NEXT: v_mov_b32_e32 v13, s12 @@ -2801,6 +2801,8 @@ ; MOVREL-LABEL: dyn_insertelement_v16f32_s_v_s: ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: v_mov_b32_e32 v16, v0 +; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: s_mov_b32 s1, s3 ; MOVREL-NEXT: s_mov_b32 s2, s4 ; MOVREL-NEXT: s_mov_b32 s3, s5 @@ -2816,8 +2818,6 @@ ; MOVREL-NEXT: s_mov_b32 s13, s15 ; MOVREL-NEXT: s_mov_b32 s14, s16 ; MOVREL-NEXT: s_mov_b32 s15, s17 -; MOVREL-NEXT: v_mov_b32_e32 v16, v0 -; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: s_mov_b32 m0, s18 ; MOVREL-NEXT: v_mov_b32_e32 v1, s1 ; MOVREL-NEXT: v_mov_b32_e32 v2, s2 @@ -2917,6 +2917,8 @@ ; MOVREL-LABEL: dyn_insertelement_v32f32_s_v_s: ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: v_mov_b32_e32 v32, v0 +; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: s_mov_b32 s1, s3 ; MOVREL-NEXT: s_mov_b32 s2, s4 ; MOVREL-NEXT: s_mov_b32 s3, s5 @@ -2948,8 +2950,6 @@ ; MOVREL-NEXT: s_mov_b32 s29, s31 ; MOVREL-NEXT: s_mov_b32 s31, s33 ; MOVREL-NEXT: s_mov_b32 s30, s32 -; MOVREL-NEXT: v_mov_b32_e32 v32, v0 -; MOVREL-NEXT: v_mov_b32_e32 v0, s0 ; MOVREL-NEXT: s_mov_b32 m0, s34 ; MOVREL-NEXT: v_mov_b32_e32 v1, s1 ; MOVREL-NEXT: v_mov_b32_e32 v2, s2 @@ -2992,40 +2992,40 @@ define amdgpu_ps <16 x i64> @dyn_insertelement_v16i64_s_v_s(<16 x i64> inreg %vec, i64 %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v16i64_s_v_s: ; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 ; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 ; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 ; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 ; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s8, s10 ; GPRIDX-NEXT: s_mov_b32 s9, s11 +; GPRIDX-NEXT: s_mov_b32 s10, s12 ; GPRIDX-NEXT: s_mov_b32 s11, s13 +; GPRIDX-NEXT: s_mov_b32 s12, s14 ; GPRIDX-NEXT: s_mov_b32 s13, s15 +; GPRIDX-NEXT: s_mov_b32 s14, s16 ; GPRIDX-NEXT: s_mov_b32 s15, s17 +; GPRIDX-NEXT: s_mov_b32 s16, s18 ; GPRIDX-NEXT: s_mov_b32 s17, s19 +; GPRIDX-NEXT: s_mov_b32 s18, s20 ; GPRIDX-NEXT: s_mov_b32 s19, s21 +; GPRIDX-NEXT: s_mov_b32 s20, s22 ; GPRIDX-NEXT: s_mov_b32 s21, s23 +; GPRIDX-NEXT: s_mov_b32 s22, s24 ; GPRIDX-NEXT: s_mov_b32 s23, s25 +; GPRIDX-NEXT: s_mov_b32 s24, s26 ; GPRIDX-NEXT: s_mov_b32 s25, s27 +; GPRIDX-NEXT: s_mov_b32 s26, s28 ; GPRIDX-NEXT: s_mov_b32 s27, s29 +; GPRIDX-NEXT: s_mov_b32 s28, s30 ; GPRIDX-NEXT: s_mov_b32 s29, s31 ; GPRIDX-NEXT: s_mov_b32 s31, s33 -; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: s_mov_b32 s8, s10 -; GPRIDX-NEXT: s_mov_b32 s10, s12 -; GPRIDX-NEXT: s_mov_b32 s12, s14 -; GPRIDX-NEXT: s_mov_b32 s14, s16 -; GPRIDX-NEXT: s_mov_b32 s16, s18 -; GPRIDX-NEXT: s_mov_b32 s18, s20 -; GPRIDX-NEXT: s_mov_b32 s20, s22 -; GPRIDX-NEXT: s_mov_b32 s22, s24 -; GPRIDX-NEXT: s_mov_b32 s24, s26 -; GPRIDX-NEXT: s_mov_b32 s26, s28 -; GPRIDX-NEXT: s_mov_b32 s28, s30 ; GPRIDX-NEXT: s_mov_b32 s30, s32 -; GPRIDX-NEXT: v_mov_b32_e32 v33, s31 ; GPRIDX-NEXT: s_lshl_b32 s33, s34, 1 +; GPRIDX-NEXT: v_mov_b32_e32 v33, s31 ; GPRIDX-NEXT: v_mov_b32_e32 v32, s30 ; GPRIDX-NEXT: v_mov_b32_e32 v31, s29 ; GPRIDX-NEXT: v_mov_b32_e32 v30, s28 @@ -3097,41 +3097,41 @@ ; ; MOVREL-LABEL: dyn_insertelement_v16i64_s_v_s: ; MOVREL: ; %bb.0: ; %entry -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: s_mov_b32 s7, s9 -; MOVREL-NEXT: s_mov_b32 s9, s11 -; MOVREL-NEXT: s_mov_b32 s11, s13 -; MOVREL-NEXT: s_mov_b32 s13, s15 -; MOVREL-NEXT: s_mov_b32 s15, s17 -; MOVREL-NEXT: s_mov_b32 s17, s19 -; MOVREL-NEXT: s_mov_b32 s19, s21 -; MOVREL-NEXT: s_mov_b32 s21, s23 -; MOVREL-NEXT: s_mov_b32 s23, s25 -; MOVREL-NEXT: s_mov_b32 s25, s27 -; MOVREL-NEXT: s_mov_b32 s27, s29 -; MOVREL-NEXT: s_mov_b32 s29, s31 -; MOVREL-NEXT: s_mov_b32 s31, s33 ; MOVREL-NEXT: s_mov_b32 s0, s2 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: s_mov_b32 s2, s4 +; MOVREL-NEXT: s_mov_b32 s3, s5 ; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: s_mov_b32 s5, s7 ; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: s_mov_b32 s7, s9 ; MOVREL-NEXT: s_mov_b32 s8, s10 +; MOVREL-NEXT: s_mov_b32 s9, s11 ; MOVREL-NEXT: s_mov_b32 s10, s12 +; MOVREL-NEXT: s_mov_b32 s11, s13 ; MOVREL-NEXT: s_mov_b32 s12, s14 +; MOVREL-NEXT: s_mov_b32 s13, s15 ; MOVREL-NEXT: s_mov_b32 s14, s16 +; MOVREL-NEXT: s_mov_b32 s15, s17 ; MOVREL-NEXT: s_mov_b32 s16, s18 +; MOVREL-NEXT: s_mov_b32 s17, s19 ; MOVREL-NEXT: s_mov_b32 s18, s20 +; MOVREL-NEXT: s_mov_b32 s19, s21 ; MOVREL-NEXT: s_mov_b32 s20, s22 +; MOVREL-NEXT: s_mov_b32 s21, s23 ; MOVREL-NEXT: s_mov_b32 s22, s24 +; MOVREL-NEXT: s_mov_b32 s23, s25 ; MOVREL-NEXT: s_mov_b32 s24, s26 +; MOVREL-NEXT: s_mov_b32 s25, s27 ; MOVREL-NEXT: s_mov_b32 s26, s28 +; MOVREL-NEXT: s_mov_b32 s27, s29 ; MOVREL-NEXT: s_mov_b32 s28, s30 +; MOVREL-NEXT: s_mov_b32 s29, s31 +; MOVREL-NEXT: s_mov_b32 s31, s33 ; MOVREL-NEXT: s_mov_b32 s30, s32 -; MOVREL-NEXT: v_mov_b32_e32 v33, s31 -; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: s_lshl_b32 m0, s34, 1 +; MOVREL-NEXT: v_mov_b32_e32 v33, s31 ; MOVREL-NEXT: v_mov_b32_e32 v32, s30 ; MOVREL-NEXT: v_mov_b32_e32 v31, s29 ; MOVREL-NEXT: v_mov_b32_e32 v30, s28 @@ -3205,40 +3205,40 @@ define amdgpu_ps <16 x double> @dyn_insertelement_v16f64_s_v_s(<16 x double> inreg %vec, double %val, i32 inreg %idx) { ; GPRIDX-LABEL: dyn_insertelement_v16f64_s_v_s: ; GPRIDX: ; %bb.0: ; %entry +; GPRIDX-NEXT: s_mov_b32 s0, s2 ; GPRIDX-NEXT: s_mov_b32 s1, s3 +; GPRIDX-NEXT: s_mov_b32 s2, s4 ; GPRIDX-NEXT: s_mov_b32 s3, s5 +; GPRIDX-NEXT: s_mov_b32 s4, s6 ; GPRIDX-NEXT: s_mov_b32 s5, s7 +; GPRIDX-NEXT: s_mov_b32 s6, s8 ; GPRIDX-NEXT: s_mov_b32 s7, s9 +; GPRIDX-NEXT: s_mov_b32 s8, s10 ; GPRIDX-NEXT: s_mov_b32 s9, s11 +; GPRIDX-NEXT: s_mov_b32 s10, s12 ; GPRIDX-NEXT: s_mov_b32 s11, s13 +; GPRIDX-NEXT: s_mov_b32 s12, s14 ; GPRIDX-NEXT: s_mov_b32 s13, s15 +; GPRIDX-NEXT: s_mov_b32 s14, s16 ; GPRIDX-NEXT: s_mov_b32 s15, s17 +; GPRIDX-NEXT: s_mov_b32 s16, s18 ; GPRIDX-NEXT: s_mov_b32 s17, s19 +; GPRIDX-NEXT: s_mov_b32 s18, s20 ; GPRIDX-NEXT: s_mov_b32 s19, s21 +; GPRIDX-NEXT: s_mov_b32 s20, s22 ; GPRIDX-NEXT: s_mov_b32 s21, s23 +; GPRIDX-NEXT: s_mov_b32 s22, s24 ; GPRIDX-NEXT: s_mov_b32 s23, s25 +; GPRIDX-NEXT: s_mov_b32 s24, s26 ; GPRIDX-NEXT: s_mov_b32 s25, s27 +; GPRIDX-NEXT: s_mov_b32 s26, s28 ; GPRIDX-NEXT: s_mov_b32 s27, s29 +; GPRIDX-NEXT: s_mov_b32 s28, s30 ; GPRIDX-NEXT: s_mov_b32 s29, s31 ; GPRIDX-NEXT: s_mov_b32 s31, s33 -; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s6, s8 -; GPRIDX-NEXT: s_mov_b32 s8, s10 -; GPRIDX-NEXT: s_mov_b32 s10, s12 -; GPRIDX-NEXT: s_mov_b32 s12, s14 -; GPRIDX-NEXT: s_mov_b32 s14, s16 -; GPRIDX-NEXT: s_mov_b32 s16, s18 -; GPRIDX-NEXT: s_mov_b32 s18, s20 -; GPRIDX-NEXT: s_mov_b32 s20, s22 -; GPRIDX-NEXT: s_mov_b32 s22, s24 -; GPRIDX-NEXT: s_mov_b32 s24, s26 -; GPRIDX-NEXT: s_mov_b32 s26, s28 -; GPRIDX-NEXT: s_mov_b32 s28, s30 ; GPRIDX-NEXT: s_mov_b32 s30, s32 -; GPRIDX-NEXT: v_mov_b32_e32 v33, s31 ; GPRIDX-NEXT: s_lshl_b32 s33, s34, 1 +; GPRIDX-NEXT: v_mov_b32_e32 v33, s31 ; GPRIDX-NEXT: v_mov_b32_e32 v32, s30 ; GPRIDX-NEXT: v_mov_b32_e32 v31, s29 ; GPRIDX-NEXT: v_mov_b32_e32 v30, s28 @@ -3310,41 +3310,41 @@ ; ; MOVREL-LABEL: dyn_insertelement_v16f64_s_v_s: ; MOVREL: ; %bb.0: ; %entry +; MOVREL-NEXT: s_mov_b32 s0, s2 ; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: v_mov_b32_e32 v2, s0 +; MOVREL-NEXT: s_mov_b32 s2, s4 ; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s4, s6 ; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s6, s8 ; MOVREL-NEXT: s_mov_b32 s7, s9 +; MOVREL-NEXT: s_mov_b32 s8, s10 ; MOVREL-NEXT: s_mov_b32 s9, s11 +; MOVREL-NEXT: s_mov_b32 s10, s12 ; MOVREL-NEXT: s_mov_b32 s11, s13 +; MOVREL-NEXT: s_mov_b32 s12, s14 ; MOVREL-NEXT: s_mov_b32 s13, s15 +; MOVREL-NEXT: s_mov_b32 s14, s16 ; MOVREL-NEXT: s_mov_b32 s15, s17 +; MOVREL-NEXT: s_mov_b32 s16, s18 ; MOVREL-NEXT: s_mov_b32 s17, s19 +; MOVREL-NEXT: s_mov_b32 s18, s20 ; MOVREL-NEXT: s_mov_b32 s19, s21 +; MOVREL-NEXT: s_mov_b32 s20, s22 ; MOVREL-NEXT: s_mov_b32 s21, s23 +; MOVREL-NEXT: s_mov_b32 s22, s24 ; MOVREL-NEXT: s_mov_b32 s23, s25 +; MOVREL-NEXT: s_mov_b32 s24, s26 ; MOVREL-NEXT: s_mov_b32 s25, s27 +; MOVREL-NEXT: s_mov_b32 s26, s28 ; MOVREL-NEXT: s_mov_b32 s27, s29 +; MOVREL-NEXT: s_mov_b32 s28, s30 ; MOVREL-NEXT: s_mov_b32 s29, s31 ; MOVREL-NEXT: s_mov_b32 s31, s33 -; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_mov_b32 s2, s4 -; MOVREL-NEXT: s_mov_b32 s4, s6 -; MOVREL-NEXT: s_mov_b32 s6, s8 -; MOVREL-NEXT: s_mov_b32 s8, s10 -; MOVREL-NEXT: s_mov_b32 s10, s12 -; MOVREL-NEXT: s_mov_b32 s12, s14 -; MOVREL-NEXT: s_mov_b32 s14, s16 -; MOVREL-NEXT: s_mov_b32 s16, s18 -; MOVREL-NEXT: s_mov_b32 s18, s20 -; MOVREL-NEXT: s_mov_b32 s20, s22 -; MOVREL-NEXT: s_mov_b32 s22, s24 -; MOVREL-NEXT: s_mov_b32 s24, s26 -; MOVREL-NEXT: s_mov_b32 s26, s28 -; MOVREL-NEXT: s_mov_b32 s28, s30 ; MOVREL-NEXT: s_mov_b32 s30, s32 -; MOVREL-NEXT: v_mov_b32_e32 v33, s31 -; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: s_lshl_b32 m0, s34, 1 +; MOVREL-NEXT: v_mov_b32_e32 v33, s31 ; MOVREL-NEXT: v_mov_b32_e32 v32, s30 ; MOVREL-NEXT: v_mov_b32_e32 v31, s29 ; MOVREL-NEXT: v_mov_b32_e32 v30, s28 @@ -3501,30 +3501,30 @@ ; GPRIDX-LABEL: dyn_insertelement_v7f32_s_v_s: ; GPRIDX: ; %bb.0: ; %entry ; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s6, s8 ; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: v_mov_b32_e32 v13, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v7, s0 ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 0 +; GPRIDX-NEXT: s_mov_b32 s2, s4 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s1 ; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 1 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v9, s2 ; GPRIDX-NEXT: v_cndmask_b32_e32 v1, v8, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 2 +; GPRIDX-NEXT: s_mov_b32 s4, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v10, s3 ; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v9, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 3 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: v_mov_b32_e32 v11, s4 ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v10, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 4 +; GPRIDX-NEXT: s_mov_b32 s6, s8 ; GPRIDX-NEXT: v_mov_b32_e32 v12, s5 ; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v11, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 5 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s6 ; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v12, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e64 vcc, s9, 6 ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v13, v0, vcc @@ -3534,24 +3534,24 @@ ; MOVREL-LABEL: dyn_insertelement_v7f32_s_v_s: ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_mov_b32 s2, s4 -; MOVREL-NEXT: s_mov_b32 s4, s6 -; MOVREL-NEXT: s_mov_b32 s6, s8 -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: v_mov_b32_e32 v13, s6 -; MOVREL-NEXT: v_mov_b32_e32 v7, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 0 +; MOVREL-NEXT: v_mov_b32_e32 v7, s0 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s2, s4 ; MOVREL-NEXT: v_mov_b32_e32 v8, s1 ; MOVREL-NEXT: v_mov_b32_e32 v9, s2 -; MOVREL-NEXT: v_mov_b32_e32 v10, s3 -; MOVREL-NEXT: v_mov_b32_e32 v11, s4 ; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 1 -; MOVREL-NEXT: v_mov_b32_e32 v12, s5 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: v_mov_b32_e32 v10, s3 +; MOVREL-NEXT: v_mov_b32_e32 v11, s4 ; MOVREL-NEXT: v_cndmask_b32_e32 v1, v8, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 2 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: v_mov_b32_e32 v12, s5 +; MOVREL-NEXT: v_mov_b32_e32 v13, s6 ; MOVREL-NEXT: v_cndmask_b32_e32 v2, v9, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s9, 3 ; MOVREL-NEXT: v_cndmask_b32_e32 v3, v10, v0, vcc_lo @@ -3572,30 +3572,30 @@ ; GPRIDX-LABEL: dyn_insertelement_v7f32_s_v_v: ; GPRIDX: ; %bb.0: ; %entry ; GPRIDX-NEXT: s_mov_b32 s0, s2 -; GPRIDX-NEXT: s_mov_b32 s2, s4 -; GPRIDX-NEXT: s_mov_b32 s4, s6 -; GPRIDX-NEXT: s_mov_b32 s6, s8 ; GPRIDX-NEXT: s_mov_b32 s1, s3 -; GPRIDX-NEXT: s_mov_b32 s3, s5 -; GPRIDX-NEXT: s_mov_b32 s5, s7 -; GPRIDX-NEXT: v_mov_b32_e32 v14, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v8, s0 ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 0, v1 +; GPRIDX-NEXT: s_mov_b32 s2, s4 ; GPRIDX-NEXT: v_mov_b32_e32 v9, s1 ; GPRIDX-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 1, v1 +; GPRIDX-NEXT: s_mov_b32 s3, s5 ; GPRIDX-NEXT: v_mov_b32_e32 v10, s2 ; GPRIDX-NEXT: v_cndmask_b32_e32 v7, v9, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 2, v1 +; GPRIDX-NEXT: s_mov_b32 s4, s6 ; GPRIDX-NEXT: v_mov_b32_e32 v11, s3 ; GPRIDX-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 3, v1 +; GPRIDX-NEXT: s_mov_b32 s5, s7 ; GPRIDX-NEXT: v_mov_b32_e32 v12, s4 ; GPRIDX-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 4, v1 +; GPRIDX-NEXT: s_mov_b32 s6, s8 ; GPRIDX-NEXT: v_mov_b32_e32 v13, s5 ; GPRIDX-NEXT: v_cndmask_b32_e32 v4, v12, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 5, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s6 ; GPRIDX-NEXT: v_cndmask_b32_e32 v5, v13, v0, vcc ; GPRIDX-NEXT: v_cmp_eq_u32_e32 vcc, 6, v1 ; GPRIDX-NEXT: v_cndmask_b32_e32 v6, v14, v0, vcc @@ -3606,24 +3606,24 @@ ; MOVREL-LABEL: dyn_insertelement_v7f32_s_v_v: ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: s_mov_b32 s0, s2 -; MOVREL-NEXT: s_mov_b32 s2, s4 -; MOVREL-NEXT: s_mov_b32 s4, s6 -; MOVREL-NEXT: s_mov_b32 s6, s8 -; MOVREL-NEXT: s_mov_b32 s1, s3 -; MOVREL-NEXT: s_mov_b32 s3, s5 -; MOVREL-NEXT: s_mov_b32 s5, s7 -; MOVREL-NEXT: v_mov_b32_e32 v14, s6 -; MOVREL-NEXT: v_mov_b32_e32 v8, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 +; MOVREL-NEXT: v_mov_b32_e32 v8, s0 +; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: s_mov_b32 s2, s4 ; MOVREL-NEXT: v_mov_b32_e32 v9, s1 ; MOVREL-NEXT: v_mov_b32_e32 v10, s2 -; MOVREL-NEXT: v_mov_b32_e32 v11, s3 -; MOVREL-NEXT: v_mov_b32_e32 v12, s4 ; MOVREL-NEXT: v_cndmask_b32_e32 v8, v8, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 1, v1 -; MOVREL-NEXT: v_mov_b32_e32 v13, s5 +; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: s_mov_b32 s4, s6 +; MOVREL-NEXT: v_mov_b32_e32 v11, s3 +; MOVREL-NEXT: v_mov_b32_e32 v12, s4 ; MOVREL-NEXT: v_cndmask_b32_e32 v7, v9, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v1 +; MOVREL-NEXT: s_mov_b32 s5, s7 +; MOVREL-NEXT: s_mov_b32 s6, s8 +; MOVREL-NEXT: v_mov_b32_e32 v13, s5 +; MOVREL-NEXT: v_mov_b32_e32 v14, s6 ; MOVREL-NEXT: v_cndmask_b32_e32 v2, v10, v0, vcc_lo ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 3, v1 ; MOVREL-NEXT: v_cndmask_b32_e32 v3, v11, v0, vcc_lo @@ -3827,6 +3827,7 @@ ; MOVREL: ; %bb.0: ; %entry ; MOVREL-NEXT: s_mov_b32 s0, s2 ; MOVREL-NEXT: s_mov_b32 s1, s3 +; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: s_mov_b32 s2, s4 ; MOVREL-NEXT: s_mov_b32 s3, s5 ; MOVREL-NEXT: s_mov_b32 s4, s6 @@ -3839,9 +3840,8 @@ ; MOVREL-NEXT: s_mov_b32 s11, s13 ; MOVREL-NEXT: s_mov_b32 s12, s14 ; MOVREL-NEXT: s_mov_b32 s13, s15 -; MOVREL-NEXT: v_mov_b32_e32 v17, s15 -; MOVREL-NEXT: v_mov_b32_e32 v2, s0 ; MOVREL-NEXT: s_lshl_b32 m0, s16, 1 +; MOVREL-NEXT: v_mov_b32_e32 v17, s15 ; MOVREL-NEXT: v_mov_b32_e32 v16, s14 ; MOVREL-NEXT: v_mov_b32_e32 v15, s13 ; MOVREL-NEXT: v_mov_b32_e32 v14, s12 @@ -3895,8 +3895,6 @@ ; GPRIDX-NEXT: s_mov_b32 s11, s13 ; GPRIDX-NEXT: s_mov_b32 s12, s14 ; GPRIDX-NEXT: s_mov_b32 s13, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v18, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 ; GPRIDX-NEXT: v_mov_b32_e32 v16, s13 ; GPRIDX-NEXT: v_mov_b32_e32 v15, s12 ; GPRIDX-NEXT: v_mov_b32_e32 v14, s11 @@ -3946,6 +3944,8 @@ ; GPRIDX-NEXT: v_readfirstlane_b32 s11, v13 ; GPRIDX-NEXT: v_readfirstlane_b32 s12, v0 ; GPRIDX-NEXT: v_readfirstlane_b32 s13, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v18, s15 +; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 ; GPRIDX-NEXT: ; return to shader part epilog ; ; MOVREL-LABEL: dyn_insertelement_v7f64_s_v_v: @@ -3954,57 +3954,55 @@ ; MOVREL-NEXT: s_mov_b32 s1, s3 ; MOVREL-NEXT: s_mov_b32 s2, s4 ; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: v_mov_b32_e32 v5, s2 +; MOVREL-NEXT: v_mov_b32_e32 v6, s3 +; MOVREL-NEXT: v_mov_b32_e32 v4, s1 +; MOVREL-NEXT: v_mov_b32_e32 v3, s0 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 ; MOVREL-NEXT: s_mov_b32 s4, s6 ; MOVREL-NEXT: s_mov_b32 s5, s7 ; MOVREL-NEXT: s_mov_b32 s6, s8 ; MOVREL-NEXT: s_mov_b32 s7, s9 +; MOVREL-NEXT: v_mov_b32_e32 v9, s6 +; MOVREL-NEXT: v_mov_b32_e32 v10, s7 +; MOVREL-NEXT: v_mov_b32_e32 v8, s5 +; MOVREL-NEXT: v_mov_b32_e32 v7, s4 +; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo +; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 +; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 +; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 ; MOVREL-NEXT: s_mov_b32 s8, s10 ; MOVREL-NEXT: s_mov_b32 s9, s11 ; MOVREL-NEXT: s_mov_b32 s10, s12 ; MOVREL-NEXT: s_mov_b32 s11, s13 ; MOVREL-NEXT: s_mov_b32 s12, s14 ; MOVREL-NEXT: s_mov_b32 s13, s15 -; MOVREL-NEXT: v_mov_b32_e32 v18, s15 -; MOVREL-NEXT: v_mov_b32_e32 v17, s14 -; MOVREL-NEXT: v_mov_b32_e32 v16, s13 ; MOVREL-NEXT: v_mov_b32_e32 v15, s12 +; MOVREL-NEXT: v_mov_b32_e32 v16, s13 ; MOVREL-NEXT: v_mov_b32_e32 v14, s11 ; MOVREL-NEXT: v_mov_b32_e32 v13, s10 ; MOVREL-NEXT: v_mov_b32_e32 v12, s9 ; MOVREL-NEXT: v_mov_b32_e32 v11, s8 -; MOVREL-NEXT: v_mov_b32_e32 v10, s7 -; MOVREL-NEXT: v_mov_b32_e32 v9, s6 -; MOVREL-NEXT: v_mov_b32_e32 v8, s5 -; MOVREL-NEXT: v_mov_b32_e32 v7, s4 -; MOVREL-NEXT: v_mov_b32_e32 v6, s3 -; MOVREL-NEXT: v_mov_b32_e32 v5, s2 -; MOVREL-NEXT: v_mov_b32_e32 v4, s1 -; MOVREL-NEXT: v_mov_b32_e32 v3, s0 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 6, v2 -; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo -; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 -; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 -; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 -; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 ; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e32 v8, v8, v1, vcc_lo -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 ; MOVREL-NEXT: v_cndmask_b32_e64 v9, v9, v0, s0 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 4, v2 ; MOVREL-NEXT: v_cndmask_b32_e64 v10, v10, v1, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 5, v2 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 6, v2 ; MOVREL-NEXT: v_readfirstlane_b32 s2, v5 ; MOVREL-NEXT: v_cndmask_b32_e32 v11, v11, v0, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e32 v2, v12, v1, vcc_lo -; MOVREL-NEXT: v_readfirstlane_b32 s3, v6 ; MOVREL-NEXT: v_cndmask_b32_e64 v12, v13, v0, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v13, v14, v1, s0 ; MOVREL-NEXT: v_cndmask_b32_e64 v0, v15, v0, s1 ; MOVREL-NEXT: v_cndmask_b32_e64 v1, v16, v1, s1 ; MOVREL-NEXT: v_readfirstlane_b32 s0, v3 ; MOVREL-NEXT: v_readfirstlane_b32 s1, v4 +; MOVREL-NEXT: v_readfirstlane_b32 s3, v6 ; MOVREL-NEXT: v_readfirstlane_b32 s4, v7 ; MOVREL-NEXT: v_readfirstlane_b32 s5, v8 ; MOVREL-NEXT: v_readfirstlane_b32 s6, v9 @@ -4015,6 +4013,8 @@ ; MOVREL-NEXT: v_readfirstlane_b32 s11, v13 ; MOVREL-NEXT: v_readfirstlane_b32 s12, v0 ; MOVREL-NEXT: v_readfirstlane_b32 s13, v1 +; MOVREL-NEXT: v_mov_b32_e32 v18, s15 +; MOVREL-NEXT: v_mov_b32_e32 v17, s14 ; MOVREL-NEXT: ; return to shader part epilog entry: %insert = insertelement <7 x double> %vec, double %val, i32 %idx @@ -4201,12 +4201,6 @@ ; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: s_mov_b32 s8, s10 ; GPRIDX-NEXT: s_mov_b32 s9, s11 -; GPRIDX-NEXT: v_mov_b32_e32 v17, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v16, s14 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s13 -; GPRIDX-NEXT: v_mov_b32_e32 v14, s12 -; GPRIDX-NEXT: v_mov_b32_e32 v13, s11 -; GPRIDX-NEXT: v_mov_b32_e32 v12, s10 ; GPRIDX-NEXT: v_mov_b32_e32 v11, s9 ; GPRIDX-NEXT: v_mov_b32_e32 v10, s8 ; GPRIDX-NEXT: v_mov_b32_e32 v9, s7 @@ -4242,6 +4236,9 @@ ; GPRIDX-NEXT: v_readfirstlane_b32 s7, v9 ; GPRIDX-NEXT: v_readfirstlane_b32 s8, v0 ; GPRIDX-NEXT: v_readfirstlane_b32 s9, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s12 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s11 +; GPRIDX-NEXT: v_mov_b32_e32 v12, s10 ; GPRIDX-NEXT: ; return to shader part epilog ; ; MOVREL-LABEL: dyn_insertelement_v5f64_s_v_s: @@ -4250,37 +4247,31 @@ ; MOVREL-NEXT: s_mov_b32 s1, s3 ; MOVREL-NEXT: s_mov_b32 s2, s4 ; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: v_mov_b32_e32 v4, s2 +; MOVREL-NEXT: v_mov_b32_e32 v5, s3 +; MOVREL-NEXT: v_mov_b32_e32 v3, s1 +; MOVREL-NEXT: v_mov_b32_e32 v2, s0 +; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 0 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 1 ; MOVREL-NEXT: s_mov_b32 s4, s6 ; MOVREL-NEXT: s_mov_b32 s5, s7 ; MOVREL-NEXT: s_mov_b32 s6, s8 ; MOVREL-NEXT: s_mov_b32 s7, s9 ; MOVREL-NEXT: s_mov_b32 s8, s10 ; MOVREL-NEXT: s_mov_b32 s9, s11 -; MOVREL-NEXT: v_mov_b32_e32 v17, s15 -; MOVREL-NEXT: v_mov_b32_e32 v16, s14 -; MOVREL-NEXT: v_mov_b32_e32 v15, s13 -; MOVREL-NEXT: v_mov_b32_e32 v14, s12 -; MOVREL-NEXT: v_mov_b32_e32 v13, s11 -; MOVREL-NEXT: v_mov_b32_e32 v12, s10 -; MOVREL-NEXT: v_mov_b32_e32 v11, s9 ; MOVREL-NEXT: v_mov_b32_e32 v10, s8 +; MOVREL-NEXT: v_mov_b32_e32 v11, s9 ; MOVREL-NEXT: v_mov_b32_e32 v9, s7 ; MOVREL-NEXT: v_mov_b32_e32 v8, s6 ; MOVREL-NEXT: v_mov_b32_e32 v7, s5 ; MOVREL-NEXT: v_mov_b32_e32 v6, s4 -; MOVREL-NEXT: v_mov_b32_e32 v5, s3 -; MOVREL-NEXT: v_mov_b32_e32 v4, s2 -; MOVREL-NEXT: v_mov_b32_e32 v3, s1 -; MOVREL-NEXT: v_mov_b32_e32 v2, s0 -; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 0 -; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 1 -; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, s12, 4 ; MOVREL-NEXT: v_cndmask_b32_e32 v2, v2, v0, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v1, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v4, v4, v0, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 vcc_lo, s12, 2 ; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, v1, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, s12, 3 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, s12, 4 ; MOVREL-NEXT: v_readfirstlane_b32 s2, v4 ; MOVREL-NEXT: v_cndmask_b32_e32 v6, v6, v0, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v1, vcc_lo @@ -4297,6 +4288,9 @@ ; MOVREL-NEXT: v_readfirstlane_b32 s7, v9 ; MOVREL-NEXT: v_readfirstlane_b32 s8, v0 ; MOVREL-NEXT: v_readfirstlane_b32 s9, v1 +; MOVREL-NEXT: v_mov_b32_e32 v14, s12 +; MOVREL-NEXT: v_mov_b32_e32 v13, s11 +; MOVREL-NEXT: v_mov_b32_e32 v12, s10 ; MOVREL-NEXT: ; return to shader part epilog entry: %insert = insertelement <5 x double> %vec, double %val, i32 %idx @@ -4316,12 +4310,6 @@ ; GPRIDX-NEXT: s_mov_b32 s7, s9 ; GPRIDX-NEXT: s_mov_b32 s8, s10 ; GPRIDX-NEXT: s_mov_b32 s9, s11 -; GPRIDX-NEXT: v_mov_b32_e32 v18, s15 -; GPRIDX-NEXT: v_mov_b32_e32 v17, s14 -; GPRIDX-NEXT: v_mov_b32_e32 v16, s13 -; GPRIDX-NEXT: v_mov_b32_e32 v15, s12 -; GPRIDX-NEXT: v_mov_b32_e32 v14, s11 -; GPRIDX-NEXT: v_mov_b32_e32 v13, s10 ; GPRIDX-NEXT: v_mov_b32_e32 v12, s9 ; GPRIDX-NEXT: v_mov_b32_e32 v11, s8 ; GPRIDX-NEXT: v_mov_b32_e32 v10, s7 @@ -4357,6 +4345,8 @@ ; GPRIDX-NEXT: v_readfirstlane_b32 s7, v9 ; GPRIDX-NEXT: v_readfirstlane_b32 s8, v0 ; GPRIDX-NEXT: v_readfirstlane_b32 s9, v1 +; GPRIDX-NEXT: v_mov_b32_e32 v14, s11 +; GPRIDX-NEXT: v_mov_b32_e32 v13, s10 ; GPRIDX-NEXT: ; return to shader part epilog ; ; MOVREL-LABEL: dyn_insertelement_v5f64_s_v_v: @@ -4365,37 +4355,31 @@ ; MOVREL-NEXT: s_mov_b32 s1, s3 ; MOVREL-NEXT: s_mov_b32 s2, s4 ; MOVREL-NEXT: s_mov_b32 s3, s5 +; MOVREL-NEXT: v_mov_b32_e32 v5, s2 +; MOVREL-NEXT: v_mov_b32_e32 v6, s3 +; MOVREL-NEXT: v_mov_b32_e32 v4, s1 +; MOVREL-NEXT: v_mov_b32_e32 v3, s0 +; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 ; MOVREL-NEXT: s_mov_b32 s4, s6 ; MOVREL-NEXT: s_mov_b32 s5, s7 ; MOVREL-NEXT: s_mov_b32 s6, s8 ; MOVREL-NEXT: s_mov_b32 s7, s9 ; MOVREL-NEXT: s_mov_b32 s8, s10 ; MOVREL-NEXT: s_mov_b32 s9, s11 -; MOVREL-NEXT: v_mov_b32_e32 v18, s15 -; MOVREL-NEXT: v_mov_b32_e32 v17, s14 -; MOVREL-NEXT: v_mov_b32_e32 v16, s13 -; MOVREL-NEXT: v_mov_b32_e32 v15, s12 -; MOVREL-NEXT: v_mov_b32_e32 v14, s11 -; MOVREL-NEXT: v_mov_b32_e32 v13, s10 -; MOVREL-NEXT: v_mov_b32_e32 v12, s9 ; MOVREL-NEXT: v_mov_b32_e32 v11, s8 +; MOVREL-NEXT: v_mov_b32_e32 v12, s9 ; MOVREL-NEXT: v_mov_b32_e32 v10, s7 ; MOVREL-NEXT: v_mov_b32_e32 v9, s6 ; MOVREL-NEXT: v_mov_b32_e32 v8, s5 ; MOVREL-NEXT: v_mov_b32_e32 v7, s4 -; MOVREL-NEXT: v_mov_b32_e32 v6, s3 -; MOVREL-NEXT: v_mov_b32_e32 v5, s2 -; MOVREL-NEXT: v_mov_b32_e32 v4, s1 -; MOVREL-NEXT: v_mov_b32_e32 v3, s0 -; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v2 -; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 1, v2 -; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 4, v2 ; MOVREL-NEXT: v_cndmask_b32_e32 v3, v3, v0, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e32 v4, v4, v1, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e64 v5, v5, v0, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e32 vcc_lo, 2, v2 ; MOVREL-NEXT: v_cndmask_b32_e64 v6, v6, v1, s0 ; MOVREL-NEXT: v_cmp_eq_u32_e64 s0, 3, v2 +; MOVREL-NEXT: v_cmp_eq_u32_e64 s1, 4, v2 ; MOVREL-NEXT: v_readfirstlane_b32 s2, v5 ; MOVREL-NEXT: v_cndmask_b32_e32 v7, v7, v0, vcc_lo ; MOVREL-NEXT: v_cndmask_b32_e32 v2, v8, v1, vcc_lo @@ -4412,6 +4396,8 @@ ; MOVREL-NEXT: v_readfirstlane_b32 s7, v9 ; MOVREL-NEXT: v_readfirstlane_b32 s8, v0 ; MOVREL-NEXT: v_readfirstlane_b32 s9, v1 +; MOVREL-NEXT: v_mov_b32_e32 v14, s11 +; MOVREL-NEXT: v_mov_b32_e32 v13, s10 ; MOVREL-NEXT: ; return to shader part epilog entry: %insert = insertelement <5 x double> %vec, double %val, i32 %idx diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.dec.ll @@ -984,6 +984,7 @@ ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -993,9 +994,8 @@ ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 -; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1006,6 +1006,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1015,9 +1016,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1067,8 +1067,8 @@ ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc @@ -1083,8 +1083,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc @@ -1187,9 +1187,9 @@ ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1203,9 +1203,9 @@ ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1238,9 +1238,9 @@ ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1254,9 +1254,9 @@ ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_dec_rtn_u64 v[0:1], v2, v[0:1] offset:32 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1543,6 +1543,7 @@ ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1552,9 +1553,8 @@ ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 -; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; CI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1565,6 +1565,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1574,9 +1575,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: flat_atomic_dec_x2 v[2:3], v[4:5], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1626,8 +1626,8 @@ ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc @@ -1642,8 +1642,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.atomic.inc.ll @@ -642,9 +642,9 @@ ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -658,9 +658,9 @@ ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -707,9 +707,9 @@ ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: s_waitcnt lgkmcnt(0) +; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s2 ; CI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 -; CI-NEXT: v_mov_b32_e32 v3, s1 ; CI-NEXT: v_mov_b32_e32 v2, s0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -723,9 +723,9 @@ ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: ds_inc_rtn_u64 v[0:1], v2, v[0:1] offset:32 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] @@ -1082,6 +1082,7 @@ ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1091,9 +1092,8 @@ ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 -; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) @@ -1104,6 +1104,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1113,9 +1114,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1164,8 +1164,8 @@ ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc @@ -1180,8 +1180,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc @@ -1416,12 +1416,12 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v4 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v4 ; GFX9-NEXT: v_mov_b32_e32 v4, 42 ; GFX9-NEXT: flat_atomic_inc v0, v[0:1], v4 offset:20 glc +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX9-NEXT: flat_store_dword v[2:3], v0 @@ -1562,9 +1562,9 @@ ; ; GFX9-LABEL: atomic_inc_shl_base_lds_0_i64: ; GFX9: ; %bb.0: -; GFX9-NEXT: v_mov_b32_e32 v1, 9 ; GFX9-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GFX9-NEXT: v_add_u32_e32 v3, 2, v0 +; GFX9-NEXT: v_mov_b32_e32 v1, 9 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 3, v0 ; GFX9-NEXT: ds_inc_rtn_u64 v[0:1], v0, v[1:2] offset:16 @@ -1749,8 +1749,8 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_add_u32 s0, s0, 32 ; GFX10-NEXT: s_addc_u32 s1, s1, 0 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc ; GFX10-NEXT: s_endpgm %gep = getelementptr i64, i64* %ptr, i32 4 @@ -1763,6 +1763,7 @@ ; CI: ; %bb.0: ; CI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; CI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; CI-NEXT: v_mov_b32_e32 v3, 0 ; CI-NEXT: s_waitcnt lgkmcnt(0) ; CI-NEXT: v_mov_b32_e32 v0, s2 ; CI-NEXT: v_mov_b32_e32 v1, s3 @@ -1772,9 +1773,8 @@ ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v0, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_add_i32_e32 v4, vcc, 40, v4 -; CI-NEXT: v_mov_b32_e32 v3, 0 +; CI-NEXT: v_mov_b32_e32 v2, 42 ; CI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; CI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc ; CI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1785,6 +1785,7 @@ ; VI: ; %bb.0: ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: v_lshlrev_b32_e32 v2, 3, v0 +; VI-NEXT: v_mov_b32_e32 v3, 0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 @@ -1794,9 +1795,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_add_u32_e32 v4, vcc, 40, v4 -; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: v_mov_b32_e32 v2, 42 ; VI-NEXT: v_addc_u32_e32 v5, vcc, 0, v5, vcc ; VI-NEXT: flat_atomic_inc_x2 v[2:3], v[4:5], v[2:3] glc ; VI-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1815,8 +1815,8 @@ ; GFX9-NEXT: v_add_co_u32_e32 v0, vcc, v0, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc ; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[4:5] offset:40 glc -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v2, v6 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc ; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1837,8 +1837,8 @@ ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1863,8 +1863,8 @@ ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: v_add_i32_e32 v2, vcc, v0, v2 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_add_i32_e32 v2, vcc, 40, v2 +; CI-NEXT: v_mov_b32_e32 v0, 42 ; CI-NEXT: v_mov_b32_e32 v1, 0 ; CI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; CI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc @@ -1879,8 +1879,8 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_add_u32_e32 v2, vcc, v0, v2 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v1, vcc -; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_add_u32_e32 v2, vcc, 40, v2 +; VI-NEXT: v_mov_b32_e32 v0, 42 ; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; VI-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc @@ -1893,8 +1893,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 42 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_mov_b32_e32 v3, s0 +; GFX9-NEXT: v_mov_b32_e32 v4, s1 ; GFX9-NEXT: v_add_co_u32_e32 v3, vcc, v3, v0 ; GFX9-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc ; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[3:4], v[1:2] offset:40 glc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -143,11 +143,11 @@ ; GFX7-NEXT: s_cmp_eq_u32 s6, 0 ; GFX7-NEXT: s_cselect_b32 s6, 1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_and_b32 s0, 1, s6 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-NEXT: v_mov_b32_e32 v2, s2 -; GFX7-NEXT: v_mov_b32_e32 v4, s4 -; GFX7-NEXT: s_and_b32 s0, 1, s6 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v5, s5 ; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX7-NEXT: s_nop 3 @@ -161,11 +161,11 @@ ; GFX8-NEXT: s_cmp_eq_u32 s6, 0 ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_and_b32 s0, 1, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s2 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: s_and_b32 s0, 1, s6 ; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX8-NEXT: s_nop 3 @@ -179,9 +179,9 @@ ; GFX10_W32-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s2 ; GFX10_W32-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10_W32-NEXT: v_mov_b32_e32 v2, s4 -; GFX10_W32-NEXT: s_and_b32 s6, 1, s6 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s3 +; GFX10_W32-NEXT: s_and_b32 s6, 1, s6 +; GFX10_W32-NEXT: v_mov_b32_e32 v2, s4 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s6 ; GFX10_W32-NEXT: v_mov_b32_e32 v3, s5 ; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3] @@ -194,9 +194,9 @@ ; GFX10_W64-NEXT: s_cmp_eq_u32 s6, 0 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s2 ; GFX10_W64-NEXT: s_cselect_b32 s6, 1, 0 -; GFX10_W64-NEXT: v_mov_b32_e32 v2, s4 -; GFX10_W64-NEXT: s_and_b32 s6, 1, s6 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s3 +; GFX10_W64-NEXT: s_and_b32 s6, 1, s6 +; GFX10_W64-NEXT: v_mov_b32_e32 v2, s4 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s6 ; GFX10_W64-NEXT: v_mov_b32_e32 v3, s5 ; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], s[0:1], v[0:1], v[2:3] @@ -515,11 +515,11 @@ ; GFX7-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x9 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: s_and_b32 s2, 1, s8 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: v_mov_b32_e32 v2, s4 -; GFX7-NEXT: v_mov_b32_e32 v4, s6 -; GFX7-NEXT: s_and_b32 s2, 1, s8 ; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: v_mov_b32_e32 v4, s6 ; GFX7-NEXT: v_mov_b32_e32 v5, s7 ; GFX7-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX7-NEXT: s_nop 3 @@ -535,11 +535,11 @@ ; GFX8-NEXT: s_load_dwordx8 s[0:7], s[0:1], 0x24 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: s_and_b32 s2, 1, s8 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v4, s6 -; GFX8-NEXT: s_and_b32 s2, 1, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: v_mov_b32_e32 v4, s6 ; GFX8-NEXT: v_mov_b32_e32 v5, s7 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 ; GFX8-NEXT: s_nop 3 @@ -557,9 +557,9 @@ ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W32-NEXT: s_and_b32 s0, 1, s2 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s8 -; GFX10_W32-NEXT: v_mov_b32_e32 v2, s10 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s9 +; GFX10_W32-NEXT: v_mov_b32_e32 v2, s10 ; GFX10_W32-NEXT: v_mov_b32_e32 v3, s11 ; GFX10_W32-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] ; GFX10_W32-NEXT: v_mov_b32_e32 v2, 0 @@ -574,9 +574,9 @@ ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) ; GFX10_W64-NEXT: s_and_b32 s0, 1, s2 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s8 -; GFX10_W64-NEXT: v_mov_b32_e32 v2, s10 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s9 +; GFX10_W64-NEXT: v_mov_b32_e32 v2, s10 ; GFX10_W64-NEXT: v_mov_b32_e32 v3, s11 ; GFX10_W64-NEXT: v_div_fmas_f64 v[0:1], s[6:7], v[0:1], v[2:3] ; GFX10_W64-NEXT: v_mov_b32_e32 v2, 0 @@ -864,9 +864,9 @@ ; GFX8-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; GFX8-NEXT: v_cmp_ne_u32_e64 s[2:3], 0, s2 ; GFX8-NEXT: s_and_b64 vcc, vcc, s[2:3] -; GFX8-NEXT: s_nop 1 -; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_nop 0 +; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -997,10 +997,10 @@ ; GFX8-NEXT: s_addc_u32 s1, s3, 0 ; GFX8-NEXT: s_and_b32 s2, 1, s6 ; GFX8-NEXT: v_cmp_ne_u32_e64 vcc, 0, s2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_waitcnt vmcnt(0) -; GFX8-NEXT: s_nop 2 +; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_div_fmas_f32 v2, v1, v2, v3 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.scale.ll @@ -312,10 +312,10 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -372,10 +372,10 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -432,10 +432,10 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], s0, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -492,10 +492,10 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[0:1], v0, s0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -970,10 +970,10 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], v0, v0, 1.0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; @@ -1026,10 +1026,10 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: v_div_scale_f32 v2, s[2:3], 2.0, 2.0, v0 ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll @@ -49,15 +49,13 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_and_or_b32 v10, v0, v4, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_and_or_b32 v11, v2, v4, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v8, v5 ; GFX9-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 @@ -66,6 +64,8 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_and_or_b32 v11, v2, v4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-NEXT: v_mov_b32_e32 v2, v7 ; GFX9-NEXT: v_mov_b32_e32 v3, v8 @@ -118,15 +118,13 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: v_mov_b32_e32 v4, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: v_and_or_b32 v10, v0, v4, v1 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 16, v3 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_and_or_b32 v11, v2, v4, v0 ; GFX9-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v8, v5 ; GFX9-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_mov_b32 s0, s2 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s2, s4 @@ -135,6 +133,8 @@ ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_and_or_b32 v11, v2, v4, v0 +; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-NEXT: v_mov_b32_e32 v2, v7 ; GFX9-NEXT: v_mov_b32_e32 v3, v8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll @@ -48,24 +48,24 @@ ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_lshl_b32 s8, s0, 16 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_and_or_b32 v10, v0, v3, v1 -; GFX9-NEXT: v_and_or_b32 v11, v2, v3, s8 ; GFX9-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v8, v5 ; GFX9-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_and_or_b32 v10, v0, v3, v1 +; GFX9-NEXT: v_and_or_b32 v11, v2, v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-NEXT: v_mov_b32_e32 v2, v7 ; GFX9-NEXT: v_mov_b32_e32 v3, v8 @@ -117,24 +117,24 @@ ; GFX9-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b32 s0, s2 +; GFX9-NEXT: v_mov_b32_e32 v5, 0 ; GFX9-NEXT: s_mov_b32 s2, s4 ; GFX9-NEXT: s_mov_b32 s4, s6 ; GFX9-NEXT: s_mov_b32 s6, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, 0xffff ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NEXT: s_lshl_b32 s8, s0, 16 -; GFX9-NEXT: v_mov_b32_e32 v5, 0 -; GFX9-NEXT: v_and_or_b32 v10, v0, v3, v1 -; GFX9-NEXT: v_and_or_b32 v11, v2, v3, s8 ; GFX9-NEXT: v_mov_b32_e32 v6, v5 ; GFX9-NEXT: v_mov_b32_e32 v7, v5 ; GFX9-NEXT: v_mov_b32_e32 v8, v5 ; GFX9-NEXT: v_mov_b32_e32 v9, v5 -; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_mov_b32 s1, s3 ; GFX9-NEXT: s_mov_b32 s3, s5 ; GFX9-NEXT: s_mov_b32 s5, s7 ; GFX9-NEXT: s_mov_b32 s7, s9 +; GFX9-NEXT: v_and_or_b32 v10, v0, v3, v1 +; GFX9-NEXT: v_and_or_b32 v11, v2, v3, s8 +; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: v_mov_b32_e32 v1, v6 ; GFX9-NEXT: v_mov_b32_e32 v2, v7 ; GFX9-NEXT: v_mov_b32_e32 v3, v8 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mfma.gfx90a.ll @@ -403,22 +403,22 @@ ; GCN-LABEL: test_mfma_f64_16x16x4f64_imm: ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_mov_b64 s[4:5], 0 -; GCN-NEXT: s_mov_b64 s[10:11], 1.0 ; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-NEXT: v_accvgpr_write_b32 a0, v4 ; GCN-NEXT: v_mov_b32_e32 v4, s5 ; GCN-NEXT: v_accvgpr_write_b32 a1, v4 ; GCN-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: v_accvgpr_write_b32 a2, v4 ; GCN-NEXT: v_mov_b32_e32 v4, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[14:15], s[14:15] op_sel:[0,1] ; GCN-NEXT: v_accvgpr_write_b32 a3, v4 ; GCN-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NEXT: s_mov_b64 s[10:11], 1.0 ; GCN-NEXT: v_accvgpr_write_b32 a4, v4 ; GCN-NEXT: v_mov_b32_e32 v4, s9 ; GCN-NEXT: v_accvgpr_write_b32 a5, v4 @@ -447,22 +447,22 @@ ; GCN: ; %bb.0: ; %bb ; GCN-NEXT: s_mov_b32 s4, 0 ; GCN-NEXT: s_mov_b32 s5, 0x405ec000 -; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-NEXT: s_load_dwordx4 s[12:15], s[0:1], 0x24 ; GCN-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 -; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] -; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] ; GCN-NEXT: v_mov_b32_e32 v4, s4 +; GCN-NEXT: s_mov_b64 s[6:7], s[4:5] ; GCN-NEXT: v_accvgpr_write_b32 a0, v4 ; GCN-NEXT: v_mov_b32_e32 v4, s5 ; GCN-NEXT: v_accvgpr_write_b32 a1, v4 ; GCN-NEXT: v_mov_b32_e32 v4, s6 +; GCN-NEXT: s_mov_b64 s[8:9], s[4:5] ; GCN-NEXT: v_accvgpr_write_b32 a2, v4 ; GCN-NEXT: v_mov_b32_e32 v4, s7 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_pk_mov_b32 v[0:1], s[14:15], s[14:15] op_sel:[0,1] ; GCN-NEXT: v_accvgpr_write_b32 a3, v4 ; GCN-NEXT: v_mov_b32_e32 v4, s8 +; GCN-NEXT: s_mov_b64 s[10:11], s[4:5] ; GCN-NEXT: v_accvgpr_write_b32 a4, v4 ; GCN-NEXT: v_mov_b32_e32 v4, s9 ; GCN-NEXT: v_accvgpr_write_b32 a5, v4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.mov.dpp.ll @@ -40,8 +40,9 @@ ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_dpp v0, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_dpp v1, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -8,12 +8,12 @@ ; GFX8-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v0, s1 ; GFX8-NEXT: s_nop 1 ; GFX8-NEXT: v_mov_b32_dpp v2, v0 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/mubuf-global.ll @@ -61,12 +61,12 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s4, 0 ; GFX6-NEXT: s_mov_b32 s5, 4 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm @@ -75,12 +75,12 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s4, 0 ; GFX7-NEXT: s_mov_b32 s5, 4 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm @@ -94,12 +94,12 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s4, 4 ; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm @@ -108,12 +108,12 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s4, 4 ; GFX7-NEXT: s_mov_b32 s5, s4 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm @@ -256,10 +256,10 @@ ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX6-NEXT: s_endpgm @@ -270,10 +270,10 @@ ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000 ; GFX7-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: v_mov_b32_e32 v2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_store_dword v2, v[0:1], s[0:3], 0 addr64 ; GFX7-NEXT: s_endpgm @@ -507,11 +507,11 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s4, 0 ; GFX6-NEXT: s_mov_b32 s5, 4 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -521,11 +521,11 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s4, 0 ; GFX7-NEXT: s_mov_b32 s5, 4 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, s4 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -540,11 +540,11 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s4, 4 ; GFX6-NEXT: s_mov_b32 s5, s4 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -554,11 +554,11 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s4, 4 ; GFX7-NEXT: s_mov_b32 s5, s4 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -702,9 +702,9 @@ ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000 ; GFX6-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_mov_b32 s2, 0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX6-NEXT: s_waitcnt vmcnt(0) @@ -716,9 +716,9 @@ ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: s_bfe_i64 s[2:3], s[4:5], 0x200000 ; GFX7-NEXT: s_lshl_b64 s[4:5], s[2:3], 2 -; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: s_mov_b32 s2, 0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 ; GFX7-NEXT: buffer_load_dword v0, v[0:1], s[0:3], 0 addr64 glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -937,12 +937,12 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s4, 0 ; GFX6-NEXT: s_mov_b32 s5, 4 -; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: v_mov_b32_e32 v0, 2 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: v_mov_b32_e32 v1, s4 ; GFX6-NEXT: v_mov_b32_e32 v2, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc @@ -955,12 +955,12 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s4, 0 ; GFX7-NEXT: s_mov_b32 s5, 4 -; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: v_mov_b32_e32 v0, 2 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s4 ; GFX7-NEXT: v_mov_b32_e32 v2, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_atomic_add v0, v[1:2], s[0:3], 0 addr64 glc @@ -1125,12 +1125,12 @@ ; GFX6: ; %bb.0: ; GFX6-NEXT: s_mov_b32 s4, 0 ; GFX6-NEXT: s_mov_b32 s5, 4 -; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: s_mov_b32 s0, s2 ; GFX6-NEXT: s_mov_b32 s1, s3 ; GFX6-NEXT: v_mov_b32_e32 v2, v0 ; GFX6-NEXT: s_mov_b32 s3, 0xf000 ; GFX6-NEXT: s_mov_b32 s2, s4 +; GFX6-NEXT: v_mov_b32_e32 v3, s4 ; GFX6-NEXT: v_mov_b32_e32 v4, s5 ; GFX6-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX6-NEXT: buffer_atomic_cmpswap v[1:2], v[3:4], s[0:3], 0 addr64 glc @@ -1144,12 +1144,12 @@ ; GFX7: ; %bb.0: ; GFX7-NEXT: s_mov_b32 s4, 0 ; GFX7-NEXT: s_mov_b32 s5, 4 -; GFX7-NEXT: v_mov_b32_e32 v3, s4 ; GFX7-NEXT: s_mov_b32 s0, s2 ; GFX7-NEXT: s_mov_b32 s1, s3 ; GFX7-NEXT: v_mov_b32_e32 v2, v0 ; GFX7-NEXT: s_mov_b32 s3, 0xf000 ; GFX7-NEXT: s_mov_b32 s2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s4 ; GFX7-NEXT: v_mov_b32_e32 v4, s5 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: buffer_atomic_cmpswap v[1:2], v[3:4], s[0:3], 0 addr64 glc diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -4212,8 +4212,8 @@ ; GFX6-NEXT: s_cselect_b32 s5, 1, 0 ; GFX6-NEXT: s_and_b32 s5, s5, 1 ; GFX6-NEXT: s_cmp_lg_u32 s5, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_addc_u32 s5, s1, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 @@ -4240,8 +4240,8 @@ ; GFX8-NEXT: s_cselect_b32 s5, 1, 0 ; GFX8-NEXT: s_and_b32 s5, s5, 1 ; GFX8-NEXT: s_cmp_lg_u32 s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s5, s1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 @@ -4268,8 +4268,8 @@ ; GFX9-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-NEXT: s_and_b32 s5, s5, 1 ; GFX9-NEXT: s_cmp_lg_u32 s5, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s5, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[2:3], 0 @@ -4566,8 +4566,8 @@ ; GFX6-NEXT: s_cselect_b32 s9, 1, 0 ; GFX6-NEXT: s_and_b32 s9, s9, 1 ; GFX6-NEXT: s_cmp_lg_u32 s9, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_addc_u32 s9, s1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 @@ -4583,13 +4583,13 @@ ; GFX6-NEXT: s_add_u32 s0, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: s_cselect_b32 s1, 1, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: s_and_b32 s1, s1, 1 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX6-NEXT: s_addc_u32 s1, s3, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] @@ -4619,8 +4619,8 @@ ; GFX8-NEXT: s_cselect_b32 s9, 1, 0 ; GFX8-NEXT: s_and_b32 s9, s9, 1 ; GFX8-NEXT: s_cmp_lg_u32 s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_addc_u32 s9, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX8-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 @@ -4636,13 +4636,13 @@ ; GFX8-NEXT: s_add_u32 s0, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX8-NEXT: s_addc_u32 s1, s3, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] @@ -4672,8 +4672,8 @@ ; GFX9-NEXT: s_cselect_b32 s9, 1, 0 ; GFX9-NEXT: s_and_b32 s9, s9, 1 ; GFX9-NEXT: s_cmp_lg_u32 s9, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_addc_u32 s9, s1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_cmp_lt_i64_e64 s[0:1], s[4:5], 0 @@ -4689,13 +4689,13 @@ ; GFX9-NEXT: s_add_u32 s0, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: s_and_b32 s1, s1, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX9-NEXT: s_addc_u32 s1, s3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] @@ -4780,13 +4780,13 @@ ; GFX6-NEXT: s_cmp_lg_u32 s8, 0 ; GFX6-NEXT: s_addc_u32 s8, s2, s6 ; GFX6-NEXT: s_cselect_b32 s9, 1, 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: s_and_b32 s9, s9, 1 +; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_cmp_lg_u32 s9, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] ; GFX6-NEXT: s_addc_u32 s9, s3, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] @@ -4845,12 +4845,12 @@ ; GFX8-NEXT: s_addc_u32 s8, s2, s6 ; GFX8-NEXT: s_cselect_b32 s9, 1, 0 ; GFX8-NEXT: s_and_b32 s9, s9, 1 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_cmp_lg_u32 s9, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_addc_u32 s9, s3, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 @@ -4915,12 +4915,12 @@ ; GFX9-NEXT: s_addc_u32 s8, s2, s6 ; GFX9-NEXT: s_cselect_b32 s9, 1, 0 ; GFX9-NEXT: s_and_b32 s9, s9, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_cmp_lg_u32 s9, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_addc_u32 s9, s3, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[8:9], s[2:3] ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 @@ -5569,13 +5569,13 @@ ; GFX6-NEXT: s_cmp_lg_u32 s16, 0 ; GFX6-NEXT: s_addc_u32 s16, s2, s10 ; GFX6-NEXT: s_cselect_b32 s17, 1, 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: s_and_b32 s17, s17, 1 +; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_cmp_lg_u32 s17, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] ; GFX6-NEXT: s_addc_u32 s17, s3, s11 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[16:17], v[0:1] @@ -5614,25 +5614,25 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_cselect_b32 s2, 1, 0 ; GFX6-NEXT: s_and_b32 s2, s2, 1 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s8 ; GFX6-NEXT: v_mov_b32_e32 v4, s9 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_addc_u32 s2, s6, s14 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_mov_b32_e32 v2, s16 ; GFX6-NEXT: v_mov_b32_e32 v3, s17 -; GFX6-NEXT: s_addc_u32 s2, s6, s14 +; GFX6-NEXT: s_cselect_b32 s3, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc -; GFX6-NEXT: s_cselect_b32 s3, 1, 0 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_and_b32 s3, s3, 1 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: s_cmp_lg_u32 s3, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_addc_u32 s3, s7, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] @@ -5695,12 +5695,12 @@ ; GFX8-NEXT: s_addc_u32 s16, s2, s10 ; GFX8-NEXT: s_cselect_b32 s17, 1, 0 ; GFX8-NEXT: s_and_b32 s17, s17, 1 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_cmp_lg_u32 s17, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_addc_u32 s17, s3, s11 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 @@ -5747,23 +5747,23 @@ ; GFX8-NEXT: s_and_b32 s2, s2, 1 ; GFX8-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s8 -; GFX8-NEXT: v_mov_b32_e32 v4, s9 ; GFX8-NEXT: s_addc_u32 s2, s6, s14 +; GFX8-NEXT: v_mov_b32_e32 v4, s9 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_cselect_b32 s3, 1, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s16 ; GFX8-NEXT: v_mov_b32_e32 v3, s17 -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 +; GFX8-NEXT: s_and_b32 s3, s3, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc -; GFX8-NEXT: s_and_b32 s3, s3, 1 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_cmp_lg_u32 s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_addc_u32 s3, s7, s15 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 @@ -5832,12 +5832,12 @@ ; GFX9-NEXT: s_addc_u32 s16, s2, s10 ; GFX9-NEXT: s_cselect_b32 s17, 1, 0 ; GFX9-NEXT: s_and_b32 s17, s17, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_cmp_lg_u32 s17, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_addc_u32 s17, s3, s11 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 @@ -5884,23 +5884,23 @@ ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s8 -; GFX9-NEXT: v_mov_b32_e32 v4, s9 ; GFX9-NEXT: s_addc_u32 s2, s6, s14 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_cselect_b32 s3, 1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s16 ; GFX9-NEXT: v_mov_b32_e32 v3, s17 -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 +; GFX9-NEXT: s_and_b32 s3, s3, 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc -; GFX9-NEXT: s_and_b32 s3, s3, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_cmp_lg_u32 s3, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: s_addc_u32 s3, s7, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX9-NEXT: s_cselect_b32 s6, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/sdivrem.ll @@ -39,13 +39,13 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s4, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s8, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -692,6 +692,7 @@ ; GFX8-NEXT: v_rcp_iflag_f32_e32 v2, v2 ; GFX8-NEXT: v_mul_lo_u32 v1, s1, v0 ; GFX8-NEXT: s_ashr_i32 s2, s3, 31 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_mul_hi_u32 v1, v0, v1 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, v0, v1 ; GFX8-NEXT: v_mul_hi_u32 v0, s0, v0 @@ -735,13 +736,12 @@ ; GFX8-NEXT: s_xor_b32 s0, s2, s11 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_xor_b32_e32 v1, s0, v1 -; GFX8-NEXT: v_mov_b32_e32 v4, s4 ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 -; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; GFX8-NEXT: s_endpgm @@ -913,6 +913,7 @@ ; GFX8-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x10 ; GFX8-NEXT: v_mov_b32_e32 v3, 0x4f7ffffe ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, s9 ; GFX8-NEXT: s_ashr_i32 s12, s0, 31 ; GFX8-NEXT: s_add_i32 s0, s0, s12 ; GFX8-NEXT: s_xor_b32 s13, s0, s12 @@ -1038,12 +1039,12 @@ ; GFX8-NEXT: s_xor_b32 s0, s2, s4 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v7, v8, vcc ; GFX8-NEXT: v_xor_b32_e32 v3, s0, v3 -; GFX8-NEXT: v_mov_b32_e32 v8, s8 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s0, v3 -; GFX8-NEXT: v_mov_b32_e32 v9, s9 ; GFX8-NEXT: v_xor_b32_e32 v7, s2, v7 -; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: v_mov_b32_e32 v8, s8 ; GFX8-NEXT: v_subrev_u32_e32 v7, vcc, s2, v7 +; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] +; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NEXT: flat_store_dwordx4 v[0:1], v[4:7] @@ -2400,13 +2401,13 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s4, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s8, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_byte v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -2581,12 +2582,12 @@ ; GFX8-NEXT: v_subrev_u32_e32 v1, vcc, s0, v1 ; GFX8-NEXT: s_movk_i32 s0, 0xff ; GFX8-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 ; GFX8-NEXT: v_xor_b32_e32 v3, s2, v3 +; GFX8-NEXT: v_lshlrev_b16_e32 v1, 8, v1 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 ; GFX8-NEXT: v_or_b32_sdwa v4, v0, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:BYTE_0 src1_sel:DWORD ; GFX8-NEXT: s_waitcnt lgkmcnt(0) ; GFX8-NEXT: v_mov_b32_e32 v0, s4 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s2, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_short v[0:1], v4 ; GFX8-NEXT: v_and_b32_e32 v0, s0, v3 @@ -2811,13 +2812,13 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s4, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s8, v3 +; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v3 ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v3 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_short v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -3223,15 +3224,15 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 -; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_xor_b32_e32 v2, s4, v2 ; GFX8-NEXT: v_subrev_u32_e32 v2, vcc, s4, v2 ; GFX8-NEXT: v_xor_b32_e32 v3, s8, v3 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_subrev_u32_e32 v3, vcc, s8, v3 +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -3381,8 +3382,8 @@ ; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v3 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_and_b32_e32 v2, s9, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/shl-ext-reduce.ll @@ -232,8 +232,8 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] ; GFX8-NEXT: s_waitcnt lgkmcnt(0) -; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v4, v2 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, v5, v3, vcc ; GFX8-NEXT: flat_store_dword v[2:3], v1 @@ -247,8 +247,8 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 ; GFX9-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 ; GFX9-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc ; GFX9-NEXT: global_store_dword v[2:3], v1, off @@ -262,8 +262,8 @@ ; GFX10-NEXT: v_mul_u32_u24_e32 v0, 7, v0 ; GFX10-NEXT: v_lshlrev_b64 v[2:3], 2, v[0:1] ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 +; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v4, v2 ; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, v5, v3, vcc_lo ; GFX10-NEXT: global_store_dword v[2:3], v1, off @@ -290,8 +290,8 @@ ; GFX7-NEXT: s_mov_b64 s[4:5], s[2:3] ; GFX7-NEXT: buffer_load_dword v1, v[1:2], s[4:7], 0 addr64 ; GFX7-NEXT: v_lshlrev_b32_e32 v5, 3, v0 -; GFX7-NEXT: v_mov_b32_e32 v4, s1 ; GFX7-NEXT: v_mov_b32_e32 v3, s0 +; GFX7-NEXT: v_mov_b32_e32 v4, s1 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: v_or_b32_e32 v0, 0xff800000, v1 ; GFX7-NEXT: v_mul_i32_i24_e32 v1, -7, v0 @@ -312,9 +312,9 @@ ; GFX8-NEXT: v_add_u32_e32 v1, vcc, v1, v3 ; GFX8-NEXT: v_addc_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: flat_load_dword v4, v[1:2] -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_add_u32_e32 v2, vcc, v2, v5 ; GFX8-NEXT: v_addc_u32_e32 v3, vcc, 0, v3, vcc ; GFX8-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -4198,8 +4198,8 @@ ; GFX6-NEXT: s_cselect_b32 s5, 1, 0 ; GFX6-NEXT: s_and_b32 s5, s5, 1 ; GFX6-NEXT: s_cmp_lg_u32 s5, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_subb_u32 s5, s1, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 @@ -4226,8 +4226,8 @@ ; GFX8-NEXT: s_cselect_b32 s5, 1, 0 ; GFX8-NEXT: s_and_b32 s5, s5, 1 ; GFX8-NEXT: s_cmp_lg_u32 s5, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_subb_u32 s5, s1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 @@ -4254,8 +4254,8 @@ ; GFX9-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-NEXT: s_and_b32 s5, s5, 1 ; GFX9-NEXT: s_cmp_lg_u32 s5, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s5, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[2:3], 0 @@ -4552,8 +4552,8 @@ ; GFX6-NEXT: s_cselect_b32 s9, 1, 0 ; GFX6-NEXT: s_and_b32 s9, s9, 1 ; GFX6-NEXT: s_cmp_lg_u32 s9, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: s_subb_u32 s9, s1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s0 ; GFX6-NEXT: v_mov_b32_e32 v1, s1 ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX6-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 @@ -4569,13 +4569,13 @@ ; GFX6-NEXT: s_sub_u32 s0, s2, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s1 ; GFX6-NEXT: s_cselect_b32 s1, 1, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: s_and_b32 s1, s1, 1 -; GFX6-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX6-NEXT: v_mov_b32_e32 v0, s8 ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 +; GFX6-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX6-NEXT: s_subb_u32 s1, s3, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] @@ -4605,8 +4605,8 @@ ; GFX8-NEXT: s_cselect_b32 s9, 1, 0 ; GFX8-NEXT: s_and_b32 s9, s9, 1 ; GFX8-NEXT: s_cmp_lg_u32 s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: s_subb_u32 s9, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX8-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 @@ -4622,13 +4622,13 @@ ; GFX8-NEXT: s_sub_u32 s0, s2, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, s1 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX8-NEXT: v_mov_b32_e32 v0, s8 ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX8-NEXT: s_subb_u32 s1, s3, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] @@ -4658,8 +4658,8 @@ ; GFX9-NEXT: s_cselect_b32 s9, 1, 0 ; GFX9-NEXT: s_and_b32 s9, s9, 1 ; GFX9-NEXT: s_cmp_lg_u32 s9, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: s_subb_u32 s9, s1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[0:1] ; GFX9-NEXT: v_cmp_gt_i64_e64 s[0:1], s[4:5], 0 @@ -4675,13 +4675,13 @@ ; GFX9-NEXT: s_sub_u32 s0, s2, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s1 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: s_and_b32 s1, s1, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc +; GFX9-NEXT: v_mov_b32_e32 v0, s8 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v0, v1, vcc ; GFX9-NEXT: s_subb_u32 s1, s3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_cndmask_b32_e32 v2, v3, v2, vcc ; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[0:1], v[0:1] @@ -4766,13 +4766,13 @@ ; GFX6-NEXT: s_cmp_lg_u32 s10, 0 ; GFX6-NEXT: s_subb_u32 s10, s2, s6 ; GFX6-NEXT: s_cselect_b32 s11, 1, 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: s_and_b32 s11, s11, 1 +; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_cmp_lg_u32 s11, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] ; GFX6-NEXT: s_subb_u32 s11, s3, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[10:11], v[0:1] @@ -4833,12 +4833,12 @@ ; GFX8-NEXT: s_subb_u32 s10, s2, s6 ; GFX8-NEXT: s_cselect_b32 s11, 1, 0 ; GFX8-NEXT: s_and_b32 s11, s11, 1 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_subb_u32 s11, s3, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 @@ -4905,12 +4905,12 @@ ; GFX9-NEXT: s_subb_u32 s10, s2, s6 ; GFX9-NEXT: s_cselect_b32 s11, 1, 0 ; GFX9-NEXT: s_and_b32 s11, s11, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_subb_u32 s11, s3, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 @@ -5595,13 +5595,13 @@ ; GFX6-NEXT: s_cmp_lg_u32 s18, 0 ; GFX6-NEXT: s_subb_u32 s18, s2, s10 ; GFX6-NEXT: s_cselect_b32 s19, 1, 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: s_and_b32 s19, s19, 1 +; GFX6-NEXT: v_mov_b32_e32 v3, s1 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: s_cmp_lg_u32 s19, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] ; GFX6-NEXT: s_subb_u32 s19, s3, s11 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[18:19], v[0:1] @@ -5642,25 +5642,25 @@ ; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_cselect_b32 s2, 1, 0 ; GFX6-NEXT: s_and_b32 s2, s2, 1 +; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: v_mov_b32_e32 v3, s16 ; GFX6-NEXT: v_mov_b32_e32 v4, s17 -; GFX6-NEXT: s_cmp_lg_u32 s2, 0 +; GFX6-NEXT: s_subb_u32 s2, s6, s14 ; GFX6-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_mov_b32_e32 v2, s18 ; GFX6-NEXT: v_mov_b32_e32 v3, s19 -; GFX6-NEXT: s_subb_u32 s2, s6, s14 +; GFX6-NEXT: s_cselect_b32 s3, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc ; GFX6-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc -; GFX6-NEXT: s_cselect_b32 s3, 1, 0 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_and_b32 s3, s3, 1 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: s_cmp_lg_u32 s3, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_subb_u32 s3, s7, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[0:1] @@ -5725,12 +5725,12 @@ ; GFX8-NEXT: s_subb_u32 s18, s2, s10 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: s_and_b32 s19, s19, 1 -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: s_subb_u32 s19, s3, s11 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 @@ -5779,23 +5779,23 @@ ; GFX8-NEXT: s_and_b32 s2, s2, 1 ; GFX8-NEXT: s_cmp_lg_u32 s2, 0 ; GFX8-NEXT: v_mov_b32_e32 v3, s16 -; GFX8-NEXT: v_mov_b32_e32 v4, s17 ; GFX8-NEXT: s_subb_u32 s2, s6, s14 +; GFX8-NEXT: v_mov_b32_e32 v4, s17 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: s_cselect_b32 s3, 1, 0 +; GFX8-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX8-NEXT: v_mov_b32_e32 v2, s18 ; GFX8-NEXT: v_mov_b32_e32 v3, s19 -; GFX8-NEXT: s_cselect_b32 s3, 1, 0 +; GFX8-NEXT: s_and_b32 s3, s3, 1 ; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc -; GFX8-NEXT: s_and_b32 s3, s3, 1 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_cmp_lg_u32 s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_subb_u32 s3, s7, s15 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 @@ -5866,12 +5866,12 @@ ; GFX9-NEXT: s_subb_u32 s18, s2, s10 ; GFX9-NEXT: s_cselect_b32 s19, 1, 0 ; GFX9-NEXT: s_and_b32 s19, s19, 1 -; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s1 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: s_subb_u32 s19, s3, s11 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 @@ -5920,23 +5920,23 @@ ; GFX9-NEXT: s_and_b32 s2, s2, 1 ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, s16 -; GFX9-NEXT: v_mov_b32_e32 v4, s17 ; GFX9-NEXT: s_subb_u32 s2, s6, s14 +; GFX9-NEXT: v_mov_b32_e32 v4, s17 ; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v1, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, s3 +; GFX9-NEXT: s_cselect_b32 s3, 1, 0 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v2, vcc ; GFX9-NEXT: v_mov_b32_e32 v2, s18 ; GFX9-NEXT: v_mov_b32_e32 v3, s19 -; GFX9-NEXT: s_cselect_b32 s3, 1, 0 +; GFX9-NEXT: s_and_b32 s3, s3, 1 ; GFX9-NEXT: v_cndmask_b32_e32 v6, v2, v0, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v7, v3, v1, vcc -; GFX9-NEXT: s_and_b32 s3, s3, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_cmp_lg_u32 s3, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: s_subb_u32 s3, s7, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX9-NEXT: s_cselect_b32 s6, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -2610,8 +2610,8 @@ ; GFX6-NEXT: s_cselect_b32 s4, 1, 0 ; GFX6-NEXT: s_and_b32 s4, s4, 1 ; GFX6-NEXT: s_cmp_lg_u32 s4, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_addc_u32 s1, s1, s3 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: v_mov_b32_e32 v2, s0 @@ -2628,8 +2628,8 @@ ; GFX8-NEXT: s_cselect_b32 s4, 1, 0 ; GFX8-NEXT: s_and_b32 s4, s4, 1 ; GFX8-NEXT: s_cmp_lg_u32 s4, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_addc_u32 s1, s1, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v2, s0 @@ -2646,8 +2646,8 @@ ; GFX9-NEXT: s_cselect_b32 s4, 1, 0 ; GFX9-NEXT: s_and_b32 s4, s4, 1 ; GFX9-NEXT: s_cmp_lg_u32 s4, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_addc_u32 s1, s1, s3 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: v_mov_b32_e32 v2, s0 @@ -2835,8 +2835,8 @@ ; GFX6-NEXT: s_cselect_b32 s8, 1, 0 ; GFX6-NEXT: s_and_b32 s8, s8, 1 ; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_addc_u32 s1, s1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_mov_b32_e32 v2, s0 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] @@ -2845,8 +2845,8 @@ ; GFX6-NEXT: s_cselect_b32 s1, 1, 0 ; GFX6-NEXT: s_and_b32 s1, s1, 1 ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: s_addc_u32 s1, s3, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc @@ -2867,8 +2867,8 @@ ; GFX8-NEXT: s_cselect_b32 s8, 1, 0 ; GFX8-NEXT: s_and_b32 s8, s8, 1 ; GFX8-NEXT: s_cmp_lg_u32 s8, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] @@ -2877,8 +2877,8 @@ ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 ; GFX8-NEXT: s_and_b32 s1, s1, 1 ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: s_addc_u32 s1, s3, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc @@ -2899,8 +2899,8 @@ ; GFX9-NEXT: s_cselect_b32 s8, 1, 0 ; GFX9-NEXT: s_and_b32 s8, s8, 1 ; GFX9-NEXT: s_cmp_lg_u32 s8, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_addc_u32 s1, s1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_mov_b32_e32 v2, s0 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] @@ -2909,8 +2909,8 @@ ; GFX9-NEXT: s_cselect_b32 s1, 1, 0 ; GFX9-NEXT: s_and_b32 s1, s1, 1 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: s_addc_u32 s1, s3, s7 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc @@ -2965,13 +2965,13 @@ ; GFX6-NEXT: s_cmp_lg_u32 s8, 0 ; GFX6-NEXT: s_addc_u32 s2, s2, s6 ; GFX6-NEXT: s_cselect_b32 s8, 1, 0 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_and_b32 s8, s8, 1 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: s_cmp_lg_u32 s8, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_addc_u32 s3, s3, s7 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] @@ -3007,12 +3007,12 @@ ; GFX8-NEXT: s_addc_u32 s2, s2, s6 ; GFX8-NEXT: s_cselect_b32 s8, 1, 0 ; GFX8-NEXT: s_and_b32 s8, s8, 1 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_cmp_lg_u32 s8, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_addc_u32 s3, s3, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 @@ -3051,12 +3051,12 @@ ; GFX9-NEXT: s_addc_u32 s2, s2, s6 ; GFX9-NEXT: s_cselect_b32 s8, 1, 0 ; GFX9-NEXT: s_and_b32 s8, s8, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_cmp_lg_u32 s8, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: s_addc_u32 s3, s3, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX9-NEXT: s_cselect_b32 s6, 1, 0 @@ -3475,13 +3475,13 @@ ; GFX6-NEXT: s_cmp_lg_u32 s16, 0 ; GFX6-NEXT: s_addc_u32 s2, s2, s10 ; GFX6-NEXT: s_cselect_b32 s16, 1, 0 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: s_and_b32 s16, s16, 1 +; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: s_cmp_lg_u32 s16, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s10 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_addc_u32 s3, s3, s11 +; GFX6-NEXT: v_mov_b32_e32 v0, s10 ; GFX6-NEXT: v_mov_b32_e32 v1, s11 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] @@ -3503,18 +3503,18 @@ ; GFX6-NEXT: s_cmp_lg_u32 s2, 0 ; GFX6-NEXT: s_addc_u32 s2, s6, s14 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc -; GFX6-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_cselect_b32 s3, 1, 0 -; GFX6-NEXT: v_mov_b32_e32 v2, s12 +; GFX6-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc ; GFX6-NEXT: s_and_b32 s3, s3, 1 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: v_mov_b32_e32 v3, s13 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc ; GFX6-NEXT: s_cmp_lg_u32 s3, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s14 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_addc_u32 s3, s7, s15 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 ; GFX6-NEXT: v_mov_b32_e32 v1, s15 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] @@ -3554,12 +3554,12 @@ ; GFX8-NEXT: s_addc_u32 s2, s2, s10 ; GFX8-NEXT: s_cselect_b32 s16, 1, 0 ; GFX8-NEXT: s_and_b32 s16, s16, 1 -; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: s_cmp_lg_u32 s16, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: s_addc_u32 s3, s3, s11 -; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] ; GFX8-NEXT: s_cselect_b32 s10, 1, 0 @@ -3586,16 +3586,16 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_cselect_b32 s3, 1, 0 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc ; GFX8-NEXT: s_and_b32 s3, s3, 1 -; GFX8-NEXT: v_mov_b32_e32 v2, s12 +; GFX8-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc ; GFX8-NEXT: s_cmp_lg_u32 s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NEXT: v_mov_b32_e32 v3, s13 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc ; GFX8-NEXT: s_addc_u32 s3, s7, s15 -; GFX8-NEXT: v_mov_b32_e32 v0, s14 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s14 ; GFX8-NEXT: v_mov_b32_e32 v1, s15 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[14:15] ; GFX8-NEXT: s_cselect_b32 s4, 1, 0 @@ -3638,12 +3638,12 @@ ; GFX9-NEXT: s_addc_u32 s2, s2, s10 ; GFX9-NEXT: s_cselect_b32 s16, 1, 0 ; GFX9-NEXT: s_and_b32 s16, s16, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: s_cmp_lg_u32 s16, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: s_addc_u32 s3, s3, s11 -; GFX9-NEXT: v_mov_b32_e32 v0, s10 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s10 ; GFX9-NEXT: v_mov_b32_e32 v1, s11 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] ; GFX9-NEXT: s_cselect_b32 s10, 1, 0 @@ -3670,16 +3670,16 @@ ; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, -1, vcc ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_cselect_b32 s3, 1, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc ; GFX9-NEXT: s_and_b32 s3, s3, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s12 +; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, -1, vcc ; GFX9-NEXT: s_cmp_lg_u32 s3, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-NEXT: v_mov_b32_e32 v3, s13 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, -1, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v7, v1, -1, vcc ; GFX9-NEXT: s_addc_u32 s3, s7, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s14 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s14 ; GFX9-NEXT: v_mov_b32_e32 v1, s15 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[14:15] ; GFX9-NEXT: s_cselect_b32 s4, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/udivrem.ll @@ -32,9 +32,9 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -246,16 +246,16 @@ ; GFX8-NEXT: v_subbrev_u32_e32 v2, vcc, 0, v2, vcc ; GFX8-NEXT: v_addc_u32_e64 v13, s[0:1], 0, v10, s[0:1] ; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v11 +; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v5 ; GFX8-NEXT: v_cndmask_b32_e32 v5, v7, v6, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v6, v8, v2, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v9, v9, v12, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v10, v10, v13, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[0:1] +; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v5, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v3, v4, v6, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v4, s4 -; GFX8-NEXT: v_cndmask_b32_e64 v0, v0, v9, s[0:1] -; GFX8-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v5, s5 ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s6 @@ -562,6 +562,7 @@ ; GFX8-NEXT: s_load_dwordx4 s[8:11], s[4:5], 0x0 ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x10 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NEXT: v_cvt_f32_u32_e32 v0, s2 ; GFX8-NEXT: v_cvt_f32_u32_e32 v1, s3 ; GFX8-NEXT: s_sub_i32 s0, 0, s2 @@ -605,7 +606,6 @@ ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s3, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v4, s8 -; GFX8-NEXT: v_mov_b32_e32 v5, s9 ; GFX8-NEXT: flat_store_dwordx2 v[4:5], v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: v_mov_b32_e32 v1, s11 @@ -740,6 +740,7 @@ ; GFX8-NEXT: v_cvt_f32_u32_e32 v6, s10 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v0, v0 ; GFX8-NEXT: v_rcp_iflag_f32_e32 v1, v1 +; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: v_mul_f32_e32 v0, 0x4f7ffffe, v0 ; GFX8-NEXT: v_cvt_u32_f32_e32 v0, v0 ; GFX8-NEXT: v_mul_f32_e32 v1, v1, v2 @@ -819,7 +820,6 @@ ; GFX8-NEXT: v_subrev_u32_e64 v7, s[0:1], s11, v8 ; GFX8-NEXT: v_cndmask_b32_e32 v7, v8, v7, vcc ; GFX8-NEXT: v_mov_b32_e32 v8, s12 -; GFX8-NEXT: v_mov_b32_e32 v9, s13 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: s_nop 0 ; GFX8-NEXT: v_mov_b32_e32 v0, s14 @@ -1310,14 +1310,14 @@ ; GFX8-NEXT: v_subbrev_u32_e64 v6, s[0:1], 0, v6, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v10, v14, v10, vcc ; GFX8-NEXT: v_cmp_ne_u32_e64 s[0:1], 0, v9 +; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v10, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e32 v9, v11, v13, vcc ; GFX8-NEXT: v_cndmask_b32_e32 v10, v12, v6, vcc -; GFX8-NEXT: v_cndmask_b32_e32 v14, v15, v16, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v6, v7, v9, s[0:1] ; GFX8-NEXT: v_cndmask_b32_e64 v7, v8, v10, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v9, s5 -; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, v14, s[0:1] ; GFX8-NEXT: v_mov_b32_e32 v8, s4 ; GFX8-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GFX8-NEXT: s_nop 0 @@ -1923,9 +1923,9 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_byte v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -2234,9 +2234,9 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 +; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: flat_store_short v[0:1], v2 ; GFX8-NEXT: v_mov_b32_e32 v0, s2 -; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_short v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -2551,11 +2551,11 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s6, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s6, v3 -; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_and_b32_e32 v2, 7, v2 ; GFX8-NEXT: flat_store_byte v[0:1], v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, 7, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_byte v[0:1], v2 ; GFX8-NEXT: s_endpgm @@ -2670,11 +2670,11 @@ ; GFX8-NEXT: v_cmp_le_u32_e32 vcc, s7, v3 ; GFX8-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc ; GFX8-NEXT: v_subrev_u32_e64 v4, s[0:1], s7, v3 -; GFX8-NEXT: v_and_b32_e32 v2, s6, v2 ; GFX8-NEXT: v_cndmask_b32_e32 v3, v3, v4, vcc +; GFX8-NEXT: v_and_b32_e32 v2, s6, v2 ; GFX8-NEXT: flat_store_dword v[0:1], v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_and_b32_e32 v2, s6, v3 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -2479,8 +2479,8 @@ ; GFX6-NEXT: s_sub_u32 s4, s0, s2 ; GFX6-NEXT: s_cselect_b32 s5, 1, 0 ; GFX6-NEXT: s_and_b32 s5, s5, 1 -; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: s_cmp_lg_u32 s5, 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s2 ; GFX6-NEXT: v_mov_b32_e32 v1, s3 ; GFX6-NEXT: s_subb_u32 s5, s1, s3 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] @@ -2497,8 +2497,8 @@ ; GFX8-NEXT: s_sub_u32 s4, s0, s2 ; GFX8-NEXT: s_cselect_b32 s5, 1, 0 ; GFX8-NEXT: s_and_b32 s5, s5, 1 -; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: s_cmp_lg_u32 s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: s_subb_u32 s5, s1, s3 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] @@ -2515,8 +2515,8 @@ ; GFX9-NEXT: s_sub_u32 s4, s0, s2 ; GFX9-NEXT: s_cselect_b32 s5, 1, 0 ; GFX9-NEXT: s_and_b32 s5, s5, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: s_cmp_lg_u32 s5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s2 ; GFX9-NEXT: v_mov_b32_e32 v1, s3 ; GFX9-NEXT: s_subb_u32 s5, s1, s3 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] @@ -2705,17 +2705,17 @@ ; GFX6-NEXT: s_cselect_b32 s9, 1, 0 ; GFX6-NEXT: s_and_b32 s9, s9, 1 ; GFX6-NEXT: s_cmp_lg_u32 s9, 0 -; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: s_subb_u32 s9, s1, s5 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 ; GFX6-NEXT: v_mov_b32_e32 v1, s5 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX6-NEXT: s_sub_u32 s0, s2, s6 ; GFX6-NEXT: s_cselect_b32 s1, 1, 0 ; GFX6-NEXT: s_and_b32 s1, s1, 1 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc @@ -2737,17 +2737,17 @@ ; GFX8-NEXT: s_cselect_b32 s9, 1, 0 ; GFX8-NEXT: s_and_b32 s9, s9, 1 ; GFX8-NEXT: s_cmp_lg_u32 s9, 0 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: s_subb_u32 s9, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX8-NEXT: s_sub_u32 s0, s2, s6 ; GFX8-NEXT: s_cselect_b32 s1, 1, 0 ; GFX8-NEXT: s_and_b32 s1, s1, 1 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: s_cmp_lg_u32 s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc @@ -2769,17 +2769,17 @@ ; GFX9-NEXT: s_cselect_b32 s9, 1, 0 ; GFX9-NEXT: s_and_b32 s9, s9, 1 ; GFX9-NEXT: s_cmp_lg_u32 s9, 0 -; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: s_subb_u32 s9, s1, s5 +; GFX9-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[0:1] ; GFX9-NEXT: s_sub_u32 s0, s2, s6 ; GFX9-NEXT: s_cselect_b32 s1, 1, 0 ; GFX9-NEXT: s_and_b32 s1, s1, 1 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: s_cmp_lg_u32 s1, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, 0, vcc @@ -2828,13 +2828,13 @@ ; GFX6-NEXT: s_sub_u32 s8, s0, s4 ; GFX6-NEXT: s_cselect_b32 s9, 1, 0 ; GFX6-NEXT: s_and_b32 s9, s9, 1 -; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: s_cmp_lg_u32 s9, 0 +; GFX6-NEXT: v_mov_b32_e32 v2, s4 ; GFX6-NEXT: v_mov_b32_e32 v3, s5 ; GFX6-NEXT: s_subb_u32 s9, s1, s5 -; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_cselect_b32 s10, 1, 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: v_mov_b32_e32 v1, s7 ; GFX6-NEXT: s_and_b32 s10, s10, 1 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc @@ -2877,12 +2877,12 @@ ; GFX8-NEXT: s_subb_u32 s10, s2, s6 ; GFX8-NEXT: s_cselect_b32 s11, 1, 0 ; GFX8-NEXT: s_and_b32 s11, s11, 1 -; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: s_cmp_lg_u32 s11, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 ; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_subb_u32 s11, s3, s7 -; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s6 ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX8-NEXT: s_cselect_b32 s6, 1, 0 @@ -2921,12 +2921,12 @@ ; GFX9-NEXT: s_subb_u32 s10, s2, s6 ; GFX9-NEXT: s_cselect_b32 s11, 1, 0 ; GFX9-NEXT: s_and_b32 s11, s11, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: s_cmp_lg_u32 s11, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 ; GFX9-NEXT: v_mov_b32_e32 v3, s5 ; GFX9-NEXT: s_subb_u32 s11, s3, s7 -; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[6:7] ; GFX9-NEXT: s_cselect_b32 s6, 1, 0 @@ -3340,44 +3340,44 @@ ; GFX6-NEXT: s_and_b32 s17, s17, 1 ; GFX6-NEXT: s_cmp_lg_u32 s17, 0 ; GFX6-NEXT: s_subb_u32 s17, s1, s9 -; GFX6-NEXT: v_mov_b32_e32 v2, s8 ; GFX6-NEXT: s_cselect_b32 s18, 1, 0 -; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: s_and_b32 s18, s18, 1 -; GFX6-NEXT: v_mov_b32_e32 v0, s10 -; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX6-NEXT: v_mov_b32_e32 v2, s8 +; GFX6-NEXT: v_mov_b32_e32 v3, s9 ; GFX6-NEXT: s_cmp_lg_u32 s18, 0 -; GFX6-NEXT: v_mov_b32_e32 v1, s11 +; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] ; GFX6-NEXT: s_subb_u32 s18, s2, s10 +; GFX6-NEXT: v_mov_b32_e32 v0, s10 +; GFX6-NEXT: v_mov_b32_e32 v1, s11 +; GFX6-NEXT: s_cselect_b32 s19, 1, 0 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] -; GFX6-NEXT: s_cselect_b32 s19, 1, 0 ; GFX6-NEXT: s_and_b32 s19, s19, 1 +; GFX6-NEXT: s_cmp_lg_u32 s19, 0 ; GFX6-NEXT: v_cndmask_b32_e64 v3, 0, 1, vcc ; GFX6-NEXT: v_cmp_eq_u64_e32 vcc, s[2:3], v[0:1] -; GFX6-NEXT: s_cmp_lg_u32 s19, 0 ; GFX6-NEXT: s_subb_u32 s19, s3, s11 ; GFX6-NEXT: v_cndmask_b32_e32 v0, v3, v2, vcc -; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_sub_u32 s0, s4, s12 -; GFX6-NEXT: v_mov_b32_e32 v2, s17 -; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX6-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX6-NEXT: s_cselect_b32 s1, 1, 0 ; GFX6-NEXT: v_mov_b32_e32 v1, s16 -; GFX6-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc +; GFX6-NEXT: v_mov_b32_e32 v2, s17 +; GFX6-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX6-NEXT: s_and_b32 s1, s1, 1 -; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc +; GFX6-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc ; GFX6-NEXT: v_mov_b32_e32 v0, s18 ; GFX6-NEXT: v_mov_b32_e32 v1, s19 ; GFX6-NEXT: s_cmp_lg_u32 s1, 0 +; GFX6-NEXT: v_mov_b32_e32 v2, s12 ; GFX6-NEXT: v_mov_b32_e32 v3, s13 ; GFX6-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc ; GFX6-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc ; GFX6-NEXT: s_subb_u32 s1, s5, s13 -; GFX6-NEXT: v_mov_b32_e32 v0, s14 ; GFX6-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] ; GFX6-NEXT: s_cselect_b32 s2, 1, 0 +; GFX6-NEXT: v_mov_b32_e32 v0, s14 ; GFX6-NEXT: v_mov_b32_e32 v1, s15 ; GFX6-NEXT: s_and_b32 s2, s2, 1 ; GFX6-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc @@ -3424,12 +3424,12 @@ ; GFX8-NEXT: s_subb_u32 s18, s2, s10 ; GFX8-NEXT: s_cselect_b32 s19, 1, 0 ; GFX8-NEXT: s_and_b32 s19, s19, 1 -; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: s_cmp_lg_u32 s19, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s8 ; GFX8-NEXT: v_mov_b32_e32 v3, s9 ; GFX8-NEXT: s_subb_u32 s19, s3, s11 -; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s10 ; GFX8-NEXT: v_mov_b32_e32 v1, s11 ; GFX8-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] ; GFX8-NEXT: s_cselect_b32 s10, 1, 0 @@ -3445,27 +3445,27 @@ ; GFX8-NEXT: s_subb_u32 s1, s5, s13 ; GFX8-NEXT: s_cselect_b32 s2, 1, 0 ; GFX8-NEXT: s_and_b32 s2, s2, 1 -; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: s_cmp_lg_u32 s2, 0 -; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX8-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX8-NEXT: s_subb_u32 s2, s6, s14 -; GFX8-NEXT: v_mov_b32_e32 v2, s17 -; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX8-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX8-NEXT: s_cselect_b32 s3, 1, 0 ; GFX8-NEXT: v_mov_b32_e32 v1, s16 -; GFX8-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc +; GFX8-NEXT: v_mov_b32_e32 v2, s17 +; GFX8-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX8-NEXT: s_and_b32 s3, s3, 1 -; GFX8-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc +; GFX8-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc ; GFX8-NEXT: v_mov_b32_e32 v0, s18 ; GFX8-NEXT: v_mov_b32_e32 v1, s19 ; GFX8-NEXT: s_cmp_lg_u32 s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s12 ; GFX8-NEXT: v_mov_b32_e32 v3, s13 ; GFX8-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc ; GFX8-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc ; GFX8-NEXT: s_subb_u32 s3, s7, s15 -; GFX8-NEXT: v_mov_b32_e32 v0, s14 ; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v0, s14 ; GFX8-NEXT: v_mov_b32_e32 v1, s15 ; GFX8-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] ; GFX8-NEXT: s_cselect_b32 s8, 1, 0 @@ -3508,12 +3508,12 @@ ; GFX9-NEXT: s_subb_u32 s18, s2, s10 ; GFX9-NEXT: s_cselect_b32 s19, 1, 0 ; GFX9-NEXT: s_and_b32 s19, s19, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: s_cmp_lg_u32 s19, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s8 ; GFX9-NEXT: v_mov_b32_e32 v3, s9 ; GFX9-NEXT: s_subb_u32 s19, s3, s11 -; GFX9-NEXT: v_mov_b32_e32 v0, s10 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s10 ; GFX9-NEXT: v_mov_b32_e32 v1, s11 ; GFX9-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] ; GFX9-NEXT: s_cselect_b32 s10, 1, 0 @@ -3529,27 +3529,27 @@ ; GFX9-NEXT: s_subb_u32 s1, s5, s13 ; GFX9-NEXT: s_cselect_b32 s2, 1, 0 ; GFX9-NEXT: s_and_b32 s2, s2, 1 -; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: s_cmp_lg_u32 s2, 0 -; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v2, vcc ; GFX9-NEXT: s_subb_u32 s2, s6, s14 -; GFX9-NEXT: v_mov_b32_e32 v2, s17 -; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 +; GFX9-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX9-NEXT: s_cselect_b32 s3, 1, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, s16 -; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc +; GFX9-NEXT: v_mov_b32_e32 v2, s17 +; GFX9-NEXT: v_cmp_ne_u32_e32 vcc, 0, v0 ; GFX9-NEXT: s_and_b32 s3, s3, 1 -; GFX9-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-NEXT: v_cndmask_b32_e64 v4, v1, 0, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v5, v2, 0, vcc ; GFX9-NEXT: v_mov_b32_e32 v0, s18 ; GFX9-NEXT: v_mov_b32_e32 v1, s19 ; GFX9-NEXT: s_cmp_lg_u32 s3, 0 +; GFX9-NEXT: v_mov_b32_e32 v2, s12 ; GFX9-NEXT: v_mov_b32_e32 v3, s13 ; GFX9-NEXT: v_cndmask_b32_e64 v6, v0, 0, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v7, v1, 0, vcc ; GFX9-NEXT: s_subb_u32 s3, s7, s15 -; GFX9-NEXT: v_mov_b32_e32 v0, s14 ; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v0, s14 ; GFX9-NEXT: v_mov_b32_e32 v1, s15 ; GFX9-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] ; GFX9-NEXT: s_cselect_b32 s8, 1, 0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/widen-i8-i16-scalar-loads.ll @@ -338,13 +338,13 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_sbyte v2, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_add_u32 s0, s0, 2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_short v[0:1], v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_short v[0:1], v3 ; GFX8-NEXT: s_endpgm @@ -387,13 +387,13 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s3 ; GFX8-NEXT: flat_load_ubyte v2, v[0:1] ; GFX8-NEXT: v_mov_b32_e32 v0, s0 -; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_add_u32 s0, s0, 2 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: s_addc_u32 s1, s1, 0 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: flat_store_short v[0:1], v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_lshrrev_b32_e32 v3, 16, v2 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 ; GFX8-NEXT: v_mov_b32_e32 v1, s1 ; GFX8-NEXT: flat_store_short v[0:1], v3 ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -342,8 +342,8 @@ define amdgpu_kernel void @marked_kernel_use_other_sgpr(i64 addrspace(1)* %ptr) #0 { ; VARABI-LABEL: marked_kernel_use_other_sgpr: ; VARABI: ; %bb.0: -; VARABI-NEXT: s_add_u32 s0, s4, 8 ; VARABI-NEXT: flat_load_ubyte v0, v[0:1] glc +; VARABI-NEXT: s_add_u32 s0, s4, 8 ; VARABI-NEXT: s_addc_u32 s1, s5, 0 ; VARABI-NEXT: s_waitcnt vmcnt(0) ; VARABI-NEXT: v_mov_b32_e32 v0, s0 @@ -355,8 +355,8 @@ ; ; FIXEDABI-LABEL: marked_kernel_use_other_sgpr: ; FIXEDABI: ; %bb.0: -; FIXEDABI-NEXT: s_add_u32 s0, s4, 8 ; FIXEDABI-NEXT: flat_load_ubyte v0, v[0:1] glc +; FIXEDABI-NEXT: s_add_u32 s0, s4, 8 ; FIXEDABI-NEXT: s_addc_u32 s1, s5, 0 ; FIXEDABI-NEXT: s_waitcnt vmcnt(0) ; FIXEDABI-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir --- a/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/accvgpr-copy.mir @@ -71,13 +71,13 @@ ; GFX908-LABEL: name: a2_to_v2 ; GFX908: liveins: $agpr0_agpr1 - ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1 - ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $vgpr0_vgpr1 ; GFX90A-LABEL: name: a2_to_v2 ; GFX90A: liveins: $agpr0_agpr1 - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr0_agpr1 - ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit killed $agpr0_agpr1, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr1, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $vgpr0_vgpr1 $vgpr0_vgpr1 = COPY killed $agpr0_agpr1, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1 @@ -92,15 +92,15 @@ ; GFX908-LABEL: name: a3_to_v3 ; GFX908: liveins: $agpr0_agpr1_agpr2 - ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $agpr0_agpr1_agpr2 - ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2 - ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 ; GFX90A-LABEL: name: a3_to_v3 ; GFX90A: liveins: $agpr0_agpr1_agpr2 - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $agpr0_agpr1_agpr2 - ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2 - ; GFX90A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit killed $agpr0_agpr1_agpr2, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX90A: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr2, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 $vgpr0_vgpr1_vgpr2 = COPY killed $agpr0_agpr1_agpr2, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2 @@ -114,17 +114,17 @@ liveins: $agpr0_agpr1_agpr2_agpr3 ; GFX908-LABEL: name: a4_to_v4 ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3 - ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX908: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX90A-LABEL: name: a4_to_v4 ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX90A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX90A: $vgpr3 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $agpr0_agpr1_agpr2_agpr3, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3 @@ -139,25 +139,25 @@ ; GFX908-LABEL: name: a8_to_v8 ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX908: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX908: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec + ; GFX908: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec + ; GFX908: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec + ; GFX908: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX90A-LABEL: name: a8_to_v8 ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX90A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX90A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX90A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec + ; GFX90A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec + ; GFX90A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec + ; GFX90A: $vgpr7 = V_ACCVGPR_READ_B32_e64 killed $agpr7, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 @@ -171,41 +171,41 @@ liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GFX908-LABEL: name: a16_to_v16 ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX908: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX908: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec + ; GFX908: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec + ; GFX908: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec + ; GFX908: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec + ; GFX908: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec + ; GFX908: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec + ; GFX908: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec + ; GFX908: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec + ; GFX908: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec + ; GFX908: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec + ; GFX908: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec + ; GFX908: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX90A-LABEL: name: a16_to_v16 ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A: $vgpr15 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX90A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX90A: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX90A: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec + ; GFX90A: $vgpr5 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec + ; GFX90A: $vgpr6 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec + ; GFX90A: $vgpr7 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec + ; GFX90A: $vgpr8 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec + ; GFX90A: $vgpr9 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec + ; GFX90A: $vgpr10 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec + ; GFX90A: $vgpr11 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec + ; GFX90A: $vgpr12 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec + ; GFX90A: $vgpr13 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec + ; GFX90A: $vgpr14 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec + ; GFX90A: $vgpr15 = V_ACCVGPR_READ_B32_e64 killed $agpr15, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 = COPY killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $exec S_ENDPGM 0, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 @@ -237,13 +237,13 @@ liveins: $vgpr0_vgpr1 ; GFX908-LABEL: name: v2_to_a2 ; GFX908: liveins: $vgpr0_vgpr1 - ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1 - ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1 ; GFX90A-LABEL: name: v2_to_a2 ; GFX90A: liveins: $vgpr0_vgpr1 - ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1, implicit $vgpr0_vgpr1 - ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit killed $vgpr0_vgpr1, implicit $exec + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1 $agpr0_agpr1 = COPY killed $vgpr0_vgpr1, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1 @@ -257,15 +257,15 @@ liveins: $vgpr0_vgpr1_vgpr2 ; GFX908-LABEL: name: v3_to_a3 ; GFX908: liveins: $vgpr0_vgpr1_vgpr2 - ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $vgpr0_vgpr1_vgpr2 - ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 - ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ; GFX90A-LABEL: name: v3_to_a3 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2 - ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2, implicit $vgpr0_vgpr1_vgpr2 - ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2 - ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2, implicit $exec + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 $agpr0_agpr1_agpr2 = COPY killed $vgpr0_vgpr1_vgpr2, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 @@ -279,17 +279,17 @@ liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX908-LABEL: name: v4_to_a4 ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec + ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 ; GFX90A-LABEL: name: v4_to_a4 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec + ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 $agpr0_agpr1_agpr2_agpr3 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 @@ -303,25 +303,25 @@ liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 ; GFX908-LABEL: name: v8_to_a8 ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec + ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec + ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec + ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec + ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec + ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GFX90A-LABEL: name: v8_to_a8 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 - ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec + ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec + ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec + ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec + ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec + ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr7, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 @@ -335,41 +335,41 @@ liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 ; GFX908-LABEL: name: v16_to_a16 ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX908: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec + ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec + ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec + ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec + ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec + ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec + ; GFX908: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec + ; GFX908: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec + ; GFX908: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec + ; GFX908: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec + ; GFX908: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec + ; GFX908: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec + ; GFX908: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec + ; GFX908: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GFX90A-LABEL: name: v16_to_a16 ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15 - ; GFX90A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 $vgpr15, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec + ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr1, implicit $exec + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec + ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr3, implicit $exec + ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr4, implicit $exec + ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr5, implicit $exec + ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 $vgpr6, implicit $exec + ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 $vgpr7, implicit $exec + ; GFX90A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 $vgpr8, implicit $exec + ; GFX90A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 $vgpr9, implicit $exec + ; GFX90A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 $vgpr10, implicit $exec + ; GFX90A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 $vgpr11, implicit $exec + ; GFX90A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 $vgpr12, implicit $exec + ; GFX90A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 $vgpr13, implicit $exec + ; GFX90A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 $vgpr14, implicit $exec + ; GFX90A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr15, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15, implicit $exec S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 @@ -403,16 +403,16 @@ liveins: $sgpr0_sgpr1 ; GFX908-LABEL: name: s2_to_a2 ; GFX908: liveins: $sgpr0_sgpr1 - ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1 - ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1 - ; GFX908: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 killed $sgpr1, implicit $exec ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1 ; GFX90A-LABEL: name: s2_to_a2 ; GFX90A: liveins: $sgpr0_sgpr1 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1 - ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec, implicit killed $sgpr0_sgpr1 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr1, implicit $exec ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1 $agpr0_agpr1 = COPY killed $sgpr0_sgpr1, implicit $exec @@ -427,20 +427,20 @@ liveins: $sgpr0_sgpr1_sgpr2 ; GFX908-LABEL: name: s3_to_a3 ; GFX908: liveins: $sgpr0_sgpr1_sgpr2 - ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2 + ; GFX908: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ; GFX90A-LABEL: name: s3_to_a3 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr2, implicit $exec ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 $agpr0_agpr1_agpr2 = COPY killed $sgpr0_sgpr1_sgpr2, implicit $exec @@ -455,24 +455,24 @@ liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX908-LABEL: name: s4_to_a4 ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX908: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 ; GFX90A-LABEL: name: s4_to_a4 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr3, implicit $exec ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3 $agpr0_agpr1_agpr2_agpr3 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec @@ -487,32 +487,32 @@ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GFX908-LABEL: name: s6_to_a6 ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX908: $vgpr2 = V_MOV_B32_e32 killed $sgpr5, implicit $exec ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 ; GFX90A-LABEL: name: s6_to_a6 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr5, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr5, implicit $exec ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit $exec @@ -527,40 +527,40 @@ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX908-LABEL: name: s8_to_a8 ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr5, implicit $exec ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908: $vgpr1 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $vgpr1 = V_MOV_B32_e32 killed $sgpr7, implicit $exec ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 ; GFX90A-LABEL: name: s8_to_a8 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr5, implicit $exec ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr7, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr7, implicit $exec ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec @@ -575,72 +575,72 @@ liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX908-LABEL: name: s16_to_a16 ; GFX908: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr4, implicit $exec ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr5, implicit $exec ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr7, implicit $exec ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr8, implicit $exec ; GFX908: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr9, implicit $exec ; GFX908: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr10, implicit $exec ; GFX908: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr11, implicit $exec ; GFX908: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr12, implicit $exec ; GFX908: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr13, implicit $exec ; GFX908: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr14, implicit $exec ; GFX908: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908: $vgpr0 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX908: $vgpr0 = V_MOV_B32_e32 killed $sgpr15, implicit $exec ; GFX908: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GFX90A-LABEL: name: s16_to_a16 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr5, implicit $exec ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr6, implicit $exec ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr7, implicit $exec ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr8, implicit $exec ; GFX90A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr9, implicit $exec ; GFX90A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr10, implicit $exec ; GFX90A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr11, implicit $exec ; GFX90A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr12, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr12, implicit $exec ; GFX90A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr13, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr13, implicit $exec ; GFX90A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr14, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr14, implicit $exec ; GFX90A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr15, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr15, implicit $exec ; GFX90A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $exec @@ -674,17 +674,17 @@ liveins: $agpr0_agpr1 ; GFX908-LABEL: name: a2_to_a2_kill ; GFX908: liveins: $agpr0_agpr1 - ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1 - ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr1_agpr2 - ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 ; GFX90A-LABEL: name: a2_to_a2_kill ; GFX90A: liveins: $agpr0_agpr1 - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1 - ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr1_agpr2 - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX90A: $agpr3 = V_ACCVGPR_MOV_B32 $agpr2, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr1, implicit $agpr2, implicit $agpr3 @@ -701,20 +701,20 @@ liveins: $agpr4_agpr5_agpr6 ; GFX908-LABEL: name: a3_to_a3_nonoverlap_kill ; GFX908: liveins: $agpr4_agpr5_agpr6 - ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr4_agpr5_agpr6 - ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6 + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 ; GFX90A-LABEL: name: a3_to_a3_nonoverlap_kill ; GFX90A: liveins: $agpr4_agpr5_agpr6 - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr4_agpr5_agpr6 - ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr4_agpr5_agpr6 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec, implicit killed $agpr4_agpr5_agpr6 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr6, implicit $exec ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2 $agpr0_agpr1_agpr2 = COPY killed $agpr4_agpr5_agpr6 @@ -729,20 +729,22 @@ liveins: $agpr1_agpr2_agpr3 ; GFX908-LABEL: name: a3_to_a3_overlap_kill ; GFX908: liveins: $agpr1_agpr2_agpr3 - ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr1_agpr2_agpr3 - ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr1_agpr2_agpr3 - ; GFX908: $vgpr4 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3 - ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr4, implicit $exec + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX908: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX908: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1 ; GFX90A-LABEL: name: a3_to_a3_overlap_kill ; GFX90A: liveins: $agpr1_agpr2_agpr3 - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr1_agpr2_agpr3 - ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr0_agpr1_agpr2 - ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr1_agpr2_agpr3 - ; GFX90A: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr1_agpr2_agpr3 - ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX90A: $agpr0 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX90A: $agpr1 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr0_agpr1_agpr2, implicit $vgpr1 $agpr0_agpr1_agpr2 = COPY killed $agpr1_agpr2_agpr3 @@ -757,23 +759,25 @@ bb.0: ; GFX908-LABEL: name: a4_to_a4 ; GFX908: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF - ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5 - ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $exec + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5 ; GFX90A-LABEL: name: a4_to_a4 ; GFX90A: $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5 - ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr2_agpr3_agpr4_agpr5 $agpr0_agpr1_agpr2_agpr3 = IMPLICIT_DEF $agpr2_agpr3_agpr4_agpr5 = COPY killed $agpr0_agpr1_agpr2_agpr3, implicit $exec @@ -788,23 +792,25 @@ liveins: $agpr0_agpr1_agpr2_agpr3 ; GFX908-LABEL: name: a4_to_a4_overlap ; GFX908: liveins: $agpr0_agpr1_agpr2_agpr3 - ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5 - ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX908: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908: $vgpr3 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr3, implicit $exec, implicit $exec + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GFX908: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 ; GFX90A-LABEL: name: a4_to_a4_overlap ; GFX90A: liveins: $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit-def $agpr2_agpr3_agpr4_agpr5 - ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 $vgpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec + ; GFX90A: $agpr3 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec + ; GFX90A: $agpr2 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 $agpr2_agpr3_agpr4_agpr5 = COPY $agpr0_agpr1_agpr2_agpr3, implicit $exec S_ENDPGM 0, implicit $agpr0, implicit $agpr1, implicit $agpr2, implicit $agpr3, implicit $agpr4, implicit $agpr5 @@ -817,40 +823,40 @@ bb.0: ; GFX908-LABEL: name: a8_to_a8 ; GFX908: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF - ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX908: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec + ; GFX908: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec ; GFX908: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec ; GFX908: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec ; GFX908: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec ; GFX908: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec ; GFX908: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX908: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec ; GFX908: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 ; GFX90A-LABEL: name: a8_to_a8 ; GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 - ; GFX90A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec + ; GFX90A: $agpr15 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec ; GFX90A: $agpr14 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec ; GFX90A: $agpr13 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec ; GFX90A: $agpr12 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec ; GFX90A: $agpr11 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec ; GFX90A: $agpr10 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX90A: $agpr9 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec ; GFX90A: $agpr8 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7 = IMPLICIT_DEF @@ -866,72 +872,72 @@ ; GFX908-LABEL: name: a16_to_a16 ; GFX908: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF - ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX908: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec + ; GFX908: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec ; GFX908: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec ; GFX908: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec ; GFX908: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec ; GFX908: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec ; GFX908: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec ; GFX908: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec ; GFX908: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec ; GFX908: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec ; GFX908: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec ; GFX908: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec ; GFX908: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec ; GFX908: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec - ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec ; GFX908: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX908: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec ; GFX908: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 ; GFX90A-LABEL: name: a16_to_a16 ; GFX90A: $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 - ; GFX90A: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr15, implicit $exec + ; GFX90A: $agpr31 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr14, implicit $exec ; GFX90A: $agpr30 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr13, implicit $exec ; GFX90A: $agpr29 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr12, implicit $exec ; GFX90A: $agpr28 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr11, implicit $exec ; GFX90A: $agpr27 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr10, implicit $exec ; GFX90A: $agpr26 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr9, implicit $exec ; GFX90A: $agpr25 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr8, implicit $exec ; GFX90A: $agpr24 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr7, implicit $exec ; GFX90A: $agpr23 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr6, implicit $exec ; GFX90A: $agpr22 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr5, implicit $exec ; GFX90A: $agpr21 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr4, implicit $exec ; GFX90A: $agpr20 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec ; GFX90A: $agpr19 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec ; GFX90A: $agpr18 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec ; GFX90A: $agpr17 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec ; GFX90A: $agpr16 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr16_agpr17_agpr18_agpr19_agpr20_agpr21_agpr22_agpr23_agpr24_agpr25_agpr26_agpr27_agpr28_agpr29_agpr30_agpr31 $agpr0_agpr1_agpr2_agpr3_agpr4_agpr5_agpr6_agpr7_agpr8_agpr9_agpr10_agpr11_agpr12_agpr13_agpr14_agpr15 = IMPLICIT_DEF @@ -974,26 +980,18 @@ ; GFX908-LABEL: name: copy_sgpr_to_agpr_tuple ; GFX908: liveins: $agpr0, $sgpr2_sgpr3 ; GFX908: S_NOP 0, implicit-def dead $sgpr0_sgpr1 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 - ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec + ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec + ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX90A-LABEL: name: copy_sgpr_to_agpr_tuple ; GFX90A: liveins: $agpr0, $sgpr2_sgpr3 ; GFX90A: S_NOP 0, implicit-def dead $sgpr0_sgpr1 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec + ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec + ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3 S_NOP 0, implicit-def dead $sgpr0_sgpr1 renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec @@ -1009,26 +1007,18 @@ ; GFX908-LABEL: name: copy_sgpr_to_agpr_tuple_kill ; GFX908: liveins: $agpr0, $sgpr2_sgpr3 ; GFX908: S_NOP 0, implicit-def dead $sgpr0_sgpr1 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 - ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec + ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec + ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 ; GFX90A-LABEL: name: copy_sgpr_to_agpr_tuple_kill ; GFX90A: liveins: $agpr0, $sgpr2_sgpr3 ; GFX90A: S_NOP 0, implicit-def dead $sgpr0_sgpr1 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr3, implicit $exec + ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec + ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 S_NOP 0, implicit-def dead $sgpr0_sgpr1 renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable killed $sgpr0_sgpr1_sgpr2_sgpr3, implicit $exec @@ -1045,26 +1035,18 @@ ; GFX908-LABEL: name: copy_agpr_to_agpr_tuple ; GFX908: liveins: $agpr0, $agpr2_agpr3 ; GFX908: S_NOP 0, implicit-def dead $agpr0_agpr1 - ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 - ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 ; GFX90A-LABEL: name: copy_agpr_to_agpr_tuple ; GFX90A: liveins: $agpr0, $agpr2_agpr3 ; GFX90A: S_NOP 0, implicit-def dead $agpr0_agpr1 - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr0, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7, implicit $agpr0_agpr1_agpr2_agpr3 S_NOP 0, implicit-def dead $agpr0_agpr1 renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable $agpr0_agpr1_agpr2_agpr3, implicit $exec @@ -1081,26 +1063,18 @@ ; GFX908-LABEL: name: copy_agpr_to_agpr_tuple_kill ; GFX908: liveins: $agpr0, $agpr2_agpr3 ; GFX908: S_NOP 0, implicit-def dead $agpr0_agpr1 - ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 - ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX908: $vgpr2 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX908: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr2, implicit $exec - ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 - ; GFX908: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX908: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr1, implicit $exec + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX908: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX908: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 ; GFX90A-LABEL: name: copy_agpr_to_agpr_tuple_kill ; GFX90A: liveins: $agpr0, $agpr2_agpr3 ; GFX90A: S_NOP 0, implicit-def dead $agpr0_agpr1 - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit-def $agpr4_agpr5_agpr6_agpr7 - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr1, implicit $exec, implicit $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $agpr5 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 killed $agpr0, implicit $exec, implicit killed $agpr0_agpr1_agpr2_agpr3 - ; GFX90A: $agpr4 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec + ; GFX90A: $agpr7 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX90A: $agpr6 = V_ACCVGPR_WRITE_B32_e64 killed $vgpr0, implicit $exec, implicit $exec ; GFX90A: S_ENDPGM 0, implicit $agpr4_agpr5_agpr6_agpr7 S_NOP 0, implicit-def dead $agpr0_agpr1 renamable $agpr4_agpr5_agpr6_agpr7 = COPY renamable killed $agpr0_agpr1_agpr2_agpr3, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -3345,8 +3345,8 @@ ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB18_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -3762,8 +3762,8 @@ ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB20_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4174,8 +4174,8 @@ ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB22_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) @@ -4584,8 +4584,8 @@ ; GFX8-NEXT: s_and_saveexec_b64 s[2:3], vcc ; GFX8-NEXT: s_cbranch_execz BB24_2 ; GFX8-NEXT: ; %bb.1: -; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v2, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, 5 ; GFX8-NEXT: v_mov_b32_e32 v1, 0 ; GFX8-NEXT: s_mov_b32 m0, -1 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/bitreverse.ll b/llvm/test/CodeGen/AMDGPU/bitreverse.ll --- a/llvm/test/CodeGen/AMDGPU/bitreverse.ll +++ b/llvm/test/CodeGen/AMDGPU/bitreverse.ll @@ -102,11 +102,11 @@ ; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: flat_load_ushort v0, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_bfrev_b32_e32 v0, v0 ; GISEL-NEXT: v_lshrrev_b32_e32 v2, 16, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GISEL-NEXT: flat_store_short v[0:1], v2 ; GISEL-NEXT: s_endpgm %val = load i16, i16 addrspace(1)* %valptr @@ -202,10 +202,10 @@ ; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: flat_load_dword v0, v[0:1] +; GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_bfrev_b32_e32 v2, v0 ; GISEL-NEXT: v_mov_b32_e32 v0, s2 -; GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GISEL-NEXT: flat_store_dword v[0:1], v2 ; GISEL-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() @@ -306,13 +306,13 @@ ; GISEL-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; GISEL-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GISEL-NEXT: s_waitcnt lgkmcnt(0) +; GISEL-NEXT: v_mov_b32_e32 v3, s3 ; GISEL-NEXT: v_mov_b32_e32 v0, s0 ; GISEL-NEXT: v_mov_b32_e32 v1, s1 ; GISEL-NEXT: v_add_u32_e32 v0, vcc, v0, v2 ; GISEL-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GISEL-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; GISEL-NEXT: v_mov_b32_e32 v2, s2 -; GISEL-NEXT: v_mov_b32_e32 v3, s3 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_bfrev_b32_e32 v0, v0 ; GISEL-NEXT: v_bfrev_b32_e32 v1, v1 @@ -541,9 +541,9 @@ ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: v_bfrev_b32_e32 v4, v1 ; GISEL-NEXT: v_bfrev_b32_e32 v5, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: v_bfrev_b32_e32 v6, v3 ; GISEL-NEXT: v_bfrev_b32_e32 v7, v2 +; GISEL-NEXT: v_mov_b32_e32 v0, s2 ; GISEL-NEXT: v_mov_b32_e32 v1, s3 ; GISEL-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GISEL-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll --- a/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll +++ b/llvm/test/CodeGen/AMDGPU/chain-hi-to-lo.ll @@ -237,9 +237,9 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_load_short_d16_hi v0, v[1:2], off ; GFX10-NEXT: s_waitcnt vmcnt(0) @@ -305,9 +305,9 @@ ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 2 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 +; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: flat_load_ushort v0, v[0:1] ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-NEXT: flat_load_short_d16_hi v0, v[1:2] ; GFX10-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -54,12 +54,12 @@ ; GFX10-NEXT: s_add_u32 s0, s2, 8 ; GFX10-NEXT: s_addc_u32 s1, s3, 0 ; GFX10-NEXT: s_add_u32 s6, s2, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: s_addc_u32 s7, s3, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_add_u32 s0, s2, 24 -; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: s_addc_u32 s1, s3, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, s6 ; GFX10-NEXT: v_mov_b32_e32 v5, s7 @@ -72,13 +72,13 @@ ; GFX10-NEXT: flat_load_dword v11, v[6:7] ; GFX10-NEXT: s_add_u32 s0, s4, 8 ; GFX10-NEXT: s_addc_u32 s1, s5, 0 -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_add_u32 s0, s4, 16 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: s_addc_u32 s1, s5, 0 -; GFX10-NEXT: s_add_u32 s2, s4, 24 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_add_u32 s2, s4, 24 ; GFX10-NEXT: s_addc_u32 s3, s5, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, s1 ; GFX10-NEXT: v_mov_b32_e32 v4, s0 @@ -168,8 +168,8 @@ ; GFX10-NEXT: s_addc_u32 s7, s3, 0 ; GFX10-NEXT: s_add_u32 s0, s2, 24 ; GFX10-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-NEXT: s_addc_u32 s1, s3, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-NEXT: s_addc_u32 s1, s3, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, s6 ; GFX10-NEXT: v_mov_b32_e32 v5, s7 ; GFX10-NEXT: flat_load_dword v6, v[2:3] @@ -182,18 +182,18 @@ ; GFX10-NEXT: s_add_u32 s0, s4, 8 ; GFX10-NEXT: s_addc_u32 s1, s5, 0 ; GFX10-NEXT: s_add_u32 s2, s4, 16 -; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: s_addc_u32 s3, s5, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 ; GFX10-NEXT: v_mov_b32_e32 v2, s0 ; GFX10-NEXT: s_add_u32 s0, s4, 24 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v3, s1 ; GFX10-NEXT: v_mov_b32_e32 v5, s3 -; GFX10-NEXT: s_addc_u32 s1, s5, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, s2 +; GFX10-NEXT: s_addc_u32 s1, s5, 0 +; GFX10-NEXT: v_mov_b32_e32 v7, s1 ; GFX10-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; GFX10-NEXT: v_add_nc_u32_e32 v11, 1, v6 -; GFX10-NEXT: v_mov_b32_e32 v7, s1 ; GFX10-NEXT: v_mov_b32_e32 v6, s0 ; GFX10-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) ; GFX10-NEXT: flat_store_dword v[0:1], v8 diff --git a/llvm/test/CodeGen/AMDGPU/copy-overlap-vgpr-kill.mir b/llvm/test/CodeGen/AMDGPU/copy-overlap-vgpr-kill.mir --- a/llvm/test/CodeGen/AMDGPU/copy-overlap-vgpr-kill.mir +++ b/llvm/test/CodeGen/AMDGPU/copy-overlap-vgpr-kill.mir @@ -14,9 +14,9 @@ ; CHECK-LABEL: name: overlapping_copy_kill_undef_reg_after_copy ; CHECK: liveins: $sgpr30_sgpr31, $vgpr1_vgpr2_vgpr3 - ; CHECK: $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr1_vgpr2_vgpr3 - ; CHECK: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr1_vgpr2_vgpr3 - ; CHECK: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr1_vgpr2_vgpr3 + ; CHECK: $vgpr0 = V_MOV_B32_e32 $vgpr1, implicit $exec + ; CHECK: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; CHECK: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec ; CHECK: renamable $vgpr1 = nofpexcept V_MUL_F32_e32 0, $vgpr1, implicit $mode, implicit $exec ; CHECK: S_SETPC_B64 $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 renamable $vgpr0_vgpr1_vgpr2 = COPY killed renamable $vgpr1_vgpr2_vgpr3 @@ -34,9 +34,9 @@ ; CHECK-LABEL: name: overlapping_copy_kill_undef_reg_after_copy_1 ; CHECK: liveins: $sgpr30_sgpr31, $vgpr2_vgpr3_vgpr4 - ; CHECK: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr2_vgpr3_vgpr4 - ; CHECK: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4 - ; CHECK: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4 + ; CHECK: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; CHECK: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; CHECK: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec ; CHECK: renamable $vgpr1 = nofpexcept V_MUL_F32_e32 0, $vgpr1, implicit $mode, implicit $exec ; CHECK: S_SETPC_B64 $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 renamable $vgpr0_vgpr1_vgpr2 = COPY killed renamable $vgpr2_vgpr3_vgpr4 @@ -54,9 +54,9 @@ ; CHECK-LABEL: name: nonoverlapping_copy_kill ; CHECK: liveins: $sgpr30_sgpr31, $vgpr3_vgpr4_vgpr5 - ; CHECK: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr3_vgpr4_vgpr5 - ; CHECK: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr3_vgpr4_vgpr5 - ; CHECK: $vgpr2 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit killed $vgpr3_vgpr4_vgpr5 + ; CHECK: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; CHECK: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec + ; CHECK: $vgpr2 = V_MOV_B32_e32 killed $vgpr5, implicit $exec ; CHECK: renamable $vgpr1 = nofpexcept V_MUL_F32_e32 0, $vgpr1, implicit $mode, implicit $exec ; CHECK: S_SETPC_B64 $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2 renamable $vgpr0_vgpr1_vgpr2 = COPY killed renamable $vgpr3_vgpr4_vgpr5 @@ -74,10 +74,10 @@ ; CHECK-LABEL: name: overlapping_copy_kill_half_s128 ; CHECK: liveins: $sgpr30_sgpr31, $vgpr2_vgpr3_vgpr4_vgpr5 - ; CHECK: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; CHECK: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; CHECK: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; CHECK: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 + ; CHECK: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; CHECK: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; CHECK: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec + ; CHECK: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec ; CHECK: renamable $vgpr1 = V_OR_B32_e32 1, $vgpr1, implicit $exec ; CHECK: S_SETPC_B64 $sgpr30_sgpr31, implicit $vgpr0, implicit $vgpr1, implicit $vgpr2, implicit $vgpr3 renamable $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed renamable $vgpr2_vgpr3_vgpr4_vgpr5 diff --git a/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir b/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir --- a/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir +++ b/llvm/test/CodeGen/AMDGPU/copy_phys_vgpr64.mir @@ -11,15 +11,15 @@ liveins: $vgpr2_vgpr3 ; GFX908-LABEL: name: copy_v64_to_v64 ; GFX908: liveins: $vgpr2_vgpr3 - ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 killed $vgpr3, implicit $exec, implicit $exec ; GFX90A-LABEL: name: copy_v64_to_v64 ; GFX90A: liveins: $vgpr2_vgpr3 - ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, killed $vgpr2_vgpr3, 12, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $exec ; GFX10-LABEL: name: copy_v64_to_v64 ; GFX10: liveins: $vgpr2_vgpr3 - ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 - ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX10: $vgpr1 = V_MOV_B32_e32 killed $vgpr3, implicit $exec, implicit $exec $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, implicit $exec ... @@ -31,15 +31,15 @@ liveins: $sgpr2_sgpr3 ; GFX908-LABEL: name: copy_s64_to_v64 ; GFX908: liveins: $sgpr2_sgpr3 - ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec ; GFX90A-LABEL: name: copy_s64_to_v64 ; GFX90A: liveins: $sgpr2_sgpr3 - ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr2_sgpr3, 12, $sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec + ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, killed $sgpr2_sgpr3, 12, killed $sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $exec ; GFX10-LABEL: name: copy_s64_to_v64 ; GFX10: liveins: $sgpr2_sgpr3 - ; GFX10: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $sgpr2_sgpr3 - ; GFX10: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit killed $sgpr2_sgpr3, implicit $exec + ; GFX10: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec + ; GFX10: $vgpr1 = V_MOV_B32_e32 killed $sgpr3, implicit $exec, implicit $exec $vgpr0_vgpr1 = COPY killed $sgpr2_sgpr3, implicit $exec ... @@ -51,16 +51,16 @@ liveins: $agpr2_agpr3 ; GFX908-LABEL: name: copy_a64_to_v64 ; GFX908: liveins: $agpr2_agpr3 - ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 - ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec + ; GFX908: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX908: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec ; GFX90A-LABEL: name: copy_a64_to_v64 ; GFX90A: liveins: $agpr2_agpr3 - ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 - ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec + ; GFX90A: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX90A: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec ; GFX10-LABEL: name: copy_a64_to_v64 ; GFX10: liveins: $agpr2_agpr3 - ; GFX10: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $agpr2_agpr3 - ; GFX10: $vgpr1 = V_ACCVGPR_READ_B32_e64 $agpr3, implicit $exec, implicit killed $agpr2_agpr3, implicit $exec + ; GFX10: $vgpr0 = V_ACCVGPR_READ_B32_e64 $agpr2, implicit $exec + ; GFX10: $vgpr1 = V_ACCVGPR_READ_B32_e64 killed $agpr3, implicit $exec, implicit $exec $vgpr0_vgpr1 = COPY killed $agpr2_agpr3, implicit $exec ... @@ -72,20 +72,20 @@ liveins: $vgpr2_vgpr3_vgpr4_vgpr5 ; GFX908-LABEL: name: copy_v128_to_v128_fwd ; GFX908: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX908: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec + ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec + ; GFX908: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $exec ; GFX90A-LABEL: name: copy_v128_to_v128_fwd ; GFX90A: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX90A: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr4_vgpr5, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec + ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + ; GFX90A: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr4_vgpr5, 12, $vgpr4_vgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $exec ; GFX10-LABEL: name: copy_v128_to_v128_fwd ; GFX10: liveins: $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX10: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec + ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr4, implicit $exec + ; GFX10: $vgpr3 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $exec $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $vgpr2_vgpr3_vgpr4_vgpr5, implicit $exec ... @@ -97,20 +97,20 @@ liveins: $vgpr0_vgpr1_vgpr2_vgpr3 ; GFX908-LABEL: name: copy_v128_to_v128_back ; GFX908: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX908: $vgpr5 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX908: $vgpr4 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX908: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX908: $vgpr5 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; GFX908: $vgpr4 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX908: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $exec ; GFX90A-LABEL: name: copy_v128_to_v128_back ; GFX90A: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX90A: $vgpr4_vgpr5 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5 - ; GFX90A: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX90A: $vgpr4_vgpr5 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec + ; GFX90A: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $vgpr0_vgpr1, 12, $vgpr0_vgpr1, 0, 0, 0, 0, 0, implicit $exec, implicit $exec ; GFX10-LABEL: name: copy_v128_to_v128_back ; GFX10: liveins: $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX10: $vgpr5 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr2_vgpr3_vgpr4_vgpr5, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX10: $vgpr4 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX10: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec + ; GFX10: $vgpr5 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; GFX10: $vgpr4 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX10: $vgpr3 = V_MOV_B32_e32 $vgpr1, implicit $exec + ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec, implicit $exec $vgpr2_vgpr3_vgpr4_vgpr5 = COPY killed $vgpr0_vgpr1_vgpr2_vgpr3, implicit $exec ... @@ -122,19 +122,19 @@ liveins: $vgpr4_vgpr5_vgpr6 ; GFX908-LABEL: name: copy_v96_to_v96 ; GFX908: liveins: $vgpr4_vgpr5_vgpr6 - ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 - ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec + ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 killed $vgpr6, implicit $exec, implicit $exec ; GFX90A-LABEL: name: copy_v96_to_v96 ; GFX90A: liveins: $vgpr4_vgpr5_vgpr6 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 - ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 - ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec + ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec + ; GFX90A: $vgpr2 = V_MOV_B32_e32 killed $vgpr6, implicit $exec, implicit $exec ; GFX10-LABEL: name: copy_v96_to_v96 ; GFX10: liveins: $vgpr4_vgpr5_vgpr6 - ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr4_vgpr5_vgpr6 - ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec, implicit $vgpr4_vgpr5_vgpr6 - ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr6, implicit $exec, implicit killed $vgpr4_vgpr5_vgpr6, implicit $exec + ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr4, implicit $exec + ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr5, implicit $exec + ; GFX10: $vgpr2 = V_MOV_B32_e32 killed $vgpr6, implicit $exec, implicit $exec $vgpr0_vgpr1_vgpr2 = COPY killed $vgpr4_vgpr5_vgpr6, implicit $exec ... @@ -146,15 +146,13 @@ liveins: $vgpr3 ; GFX908-LABEL: name: copy_v64_to_v64_undef_sub0 ; GFX908: liveins: $vgpr3 - ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 killed $vgpr3, implicit $exec, implicit $exec ; GFX90A-LABEL: name: copy_v64_to_v64_undef_sub0 ; GFX90A: liveins: $vgpr3 - ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, killed $vgpr2_vgpr3, 12, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $exec ; GFX10-LABEL: name: copy_v64_to_v64_undef_sub0 ; GFX10: liveins: $vgpr3 - ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 - ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX10: $vgpr1 = V_MOV_B32_e32 killed $vgpr3, implicit $exec, implicit $exec $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, implicit $exec ... @@ -166,15 +164,13 @@ liveins: $vgpr2 ; GFX908-LABEL: name: copy_v64_to_v64_undef_sub1 ; GFX908: liveins: $vgpr2 - ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $exec ; GFX90A-LABEL: name: copy_v64_to_v64_undef_sub1 ; GFX90A: liveins: $vgpr2 - ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, killed $vgpr2_vgpr3, 12, killed $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec, implicit $exec ; GFX10-LABEL: name: copy_v64_to_v64_undef_sub1 ; GFX10: liveins: $vgpr2 - ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr2_vgpr3 - ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit killed $vgpr2_vgpr3, implicit $exec + ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit $exec $vgpr0_vgpr1 = COPY killed $vgpr2_vgpr3, implicit $exec ... @@ -186,20 +182,20 @@ liveins: $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX908-LABEL: name: copy_s128_to_v128_killed ; GFX908: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX908: $vgpr3 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr6, implicit $exec + ; GFX908: $vgpr3 = V_MOV_B32_e32 killed $sgpr7, implicit $exec ; GFX90A-LABEL: name: copy_s128_to_v128_killed ; GFX90A: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr4_sgpr5, 12, $sgpr4_sgpr5, 0, 0, 0, 0, 0, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3 - ; GFX90A: $vgpr2_vgpr3 = V_PK_MOV_B32 8, $sgpr6_sgpr7, 12, $sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr4_sgpr5, 12, $sgpr4_sgpr5, 0, 0, 0, 0, 0, implicit $exec + ; GFX90A: $vgpr2_vgpr3 = V_PK_MOV_B32 8, killed $sgpr6_sgpr7, 12, killed $sgpr6_sgpr7, 0, 0, 0, 0, 0, implicit $exec ; GFX10-LABEL: name: copy_s128_to_v128_killed ; GFX10: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX10: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX10: $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX10: $vgpr2 = V_MOV_B32_e32 $sgpr6, implicit $exec, implicit $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX10: $vgpr3 = V_MOV_B32_e32 $sgpr7, implicit $exec, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX10: $vgpr0 = V_MOV_B32_e32 $sgpr4, implicit $exec + ; GFX10: $vgpr1 = V_MOV_B32_e32 $sgpr5, implicit $exec + ; GFX10: $vgpr2 = V_MOV_B32_e32 $sgpr6, implicit $exec + ; GFX10: $vgpr3 = V_MOV_B32_e32 killed $sgpr7, implicit $exec $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $sgpr4_sgpr5_sgpr6_sgpr7 ... @@ -211,16 +207,16 @@ liveins: $vgpr2_vgpr3 ; GFX908-LABEL: name: copy_v64_to_v64_unaligned ; GFX908: liveins: $vgpr2_vgpr3 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 - ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $exec ; GFX90A-LABEL: name: copy_v64_to_v64_unaligned ; GFX90A: liveins: $vgpr2_vgpr3 - ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 - ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec + ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $exec ; GFX10-LABEL: name: copy_v64_to_v64_unaligned ; GFX10: liveins: $vgpr2_vgpr3 - ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $vgpr2_vgpr3 - ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $vgpr2_vgpr3, implicit $exec + ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr2, implicit $exec + ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit $exec $vgpr1_vgpr2 = COPY killed $vgpr2_vgpr3, implicit $exec ... @@ -232,16 +228,16 @@ liveins: $vgpr3_vgpr4 ; GFX908-LABEL: name: copy_v64_unaligned_to_v64 ; GFX908: liveins: $vgpr3_vgpr4 - ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec + ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec ; GFX90A-LABEL: name: copy_v64_unaligned_to_v64 ; GFX90A: liveins: $vgpr3_vgpr4 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 - ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; GFX90A: $vgpr1 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec ; GFX10-LABEL: name: copy_v64_unaligned_to_v64 ; GFX10: liveins: $vgpr3_vgpr4 - ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1, implicit $vgpr3_vgpr4 - ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr4, implicit $exec, implicit killed $vgpr3_vgpr4, implicit $exec + ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr3, implicit $exec + ; GFX10: $vgpr1 = V_MOV_B32_e32 killed $vgpr4, implicit $exec, implicit $exec $vgpr0_vgpr1 = COPY killed $vgpr3_vgpr4, implicit $exec ... @@ -253,22 +249,22 @@ liveins: $vgpr8_vgpr9_vgpr10_vgpr11 ; GFX908-LABEL: name: copy_v128_to_v128_unaligned ; GFX908: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX908: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX908: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec + ; GFX908: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec + ; GFX908: $vgpr4 = V_MOV_B32_e32 killed $vgpr11, implicit $exec, implicit $exec ; GFX90A-LABEL: name: copy_v128_to_v128_unaligned ; GFX90A: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX90A: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX90A: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec + ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec + ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec + ; GFX90A: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec + ; GFX90A: $vgpr4 = V_MOV_B32_e32 killed $vgpr11, implicit $exec, implicit $exec ; GFX10-LABEL: name: copy_v128_to_v128_unaligned ; GFX10: liveins: $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX10: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit $vgpr8_vgpr9_vgpr10_vgpr11 - ; GFX10: $vgpr4 = V_MOV_B32_e32 $vgpr11, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec + ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec + ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec + ; GFX10: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec + ; GFX10: $vgpr4 = V_MOV_B32_e32 killed $vgpr11, implicit $exec, implicit $exec $vgpr1_vgpr2_vgpr3_vgpr4 = COPY killed $vgpr8_vgpr9_vgpr10_vgpr11, implicit $exec ... @@ -280,22 +276,22 @@ liveins: $vgpr7_vgpr8_vgpr9_vgpr10 ; GFX908-LABEL: name: copy_v128_unaligned_to_v128 ; GFX908: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX908: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec + ; GFX908: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec ; GFX90A-LABEL: name: copy_v128_unaligned_to_v128 ; GFX90A: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX90A: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec + ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec + ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec + ; GFX90A: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec ; GFX10-LABEL: name: copy_v128_unaligned_to_v128 ; GFX10: liveins: $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr7_vgpr8_vgpr9_vgpr10 - ; GFX10: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec + ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec + ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec + ; GFX10: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec $vgpr0_vgpr1_vgpr2_vgpr3 = COPY killed $vgpr7_vgpr8_vgpr9_vgpr10, implicit $exec ... @@ -307,16 +303,16 @@ liveins: $sgpr8_sgpr9 ; GFX908-LABEL: name: copy_s64_to_v64_unaligned ; GFX908: liveins: $sgpr8_sgpr9 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 - ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec, implicit $exec ; GFX90A-LABEL: name: copy_s64_to_v64_unaligned ; GFX90A: liveins: $sgpr8_sgpr9 - ; GFX90A: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 - ; GFX90A: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec + ; GFX90A: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec + ; GFX90A: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec, implicit $exec ; GFX10-LABEL: name: copy_s64_to_v64_unaligned ; GFX10: liveins: $sgpr8_sgpr9 - ; GFX10: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2, implicit $sgpr8_sgpr9 - ; GFX10: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit killed $sgpr8_sgpr9, implicit $exec + ; GFX10: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec + ; GFX10: $vgpr2 = V_MOV_B32_e32 killed $sgpr9, implicit $exec, implicit $exec $vgpr1_vgpr2 = COPY killed $sgpr8_sgpr9, implicit $exec ... @@ -328,22 +324,22 @@ liveins: $sgpr8_sgpr9_sgpr10_sgpr11 ; GFX908-LABEL: name: copy_s128_to_v128_unaligned ; GFX908: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX908: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX908: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec + ; GFX908: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec + ; GFX908: $vgpr4 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $exec ; GFX90A-LABEL: name: copy_s128_to_v128_unaligned ; GFX90A: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX90A: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX90A: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX90A: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX90A: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec + ; GFX90A: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec + ; GFX90A: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec + ; GFX90A: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec + ; GFX90A: $vgpr4 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $exec ; GFX10-LABEL: name: copy_s128_to_v128_unaligned ; GFX10: liveins: $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX10: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3_vgpr4, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX10: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX10: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec, implicit $sgpr8_sgpr9_sgpr10_sgpr11 - ; GFX10: $vgpr4 = V_MOV_B32_e32 $sgpr11, implicit $exec, implicit killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec + ; GFX10: $vgpr1 = V_MOV_B32_e32 $sgpr8, implicit $exec + ; GFX10: $vgpr2 = V_MOV_B32_e32 $sgpr9, implicit $exec + ; GFX10: $vgpr3 = V_MOV_B32_e32 $sgpr10, implicit $exec + ; GFX10: $vgpr4 = V_MOV_B32_e32 killed $sgpr11, implicit $exec, implicit $exec $vgpr1_vgpr2_vgpr3_vgpr4 = COPY killed $sgpr8_sgpr9_sgpr10_sgpr11, implicit $exec ... @@ -355,19 +351,19 @@ liveins: $vgpr8_vgpr9_vgpr10 ; GFX908-LABEL: name: copy_v96_to_v96_unaligned ; GFX908: liveins: $vgpr8_vgpr9_vgpr10 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 - ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 - ; GFX908: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec + ; GFX908: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec ; GFX90A-LABEL: name: copy_v96_to_v96_unaligned ; GFX90A: liveins: $vgpr8_vgpr9_vgpr10 - ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 - ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 - ; GFX90A: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec + ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec + ; GFX90A: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec ; GFX10-LABEL: name: copy_v96_to_v96_unaligned ; GFX10: liveins: $vgpr8_vgpr9_vgpr10 - ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $vgpr8_vgpr9_vgpr10 - ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit $vgpr8_vgpr9_vgpr10 - ; GFX10: $vgpr3 = V_MOV_B32_e32 $vgpr10, implicit $exec, implicit killed $vgpr8_vgpr9_vgpr10, implicit $exec + ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec + ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec + ; GFX10: $vgpr3 = V_MOV_B32_e32 killed $vgpr10, implicit $exec, implicit $exec $vgpr1_vgpr2_vgpr3 = COPY killed $vgpr8_vgpr9_vgpr10, implicit $exec ... @@ -379,19 +375,19 @@ liveins: $vgpr7_vgpr8_vgpr9 ; GFX908-LABEL: name: copy_v96_unaligned_to_v96 ; GFX908: liveins: $vgpr7_vgpr8_vgpr9 - ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 - ; GFX908: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec + ; GFX908: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec, implicit $exec ; GFX90A-LABEL: name: copy_v96_unaligned_to_v96 ; GFX90A: liveins: $vgpr7_vgpr8_vgpr9 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 - ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 - ; GFX90A: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec + ; GFX90A: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec + ; GFX90A: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec, implicit $exec ; GFX10-LABEL: name: copy_v96_unaligned_to_v96 ; GFX10: liveins: $vgpr7_vgpr8_vgpr9 - ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $vgpr7_vgpr8_vgpr9 - ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec, implicit $vgpr7_vgpr8_vgpr9 - ; GFX10: $vgpr2 = V_MOV_B32_e32 $vgpr9, implicit $exec, implicit killed $vgpr7_vgpr8_vgpr9, implicit $exec + ; GFX10: $vgpr0 = V_MOV_B32_e32 $vgpr7, implicit $exec + ; GFX10: $vgpr1 = V_MOV_B32_e32 $vgpr8, implicit $exec + ; GFX10: $vgpr2 = V_MOV_B32_e32 killed $vgpr9, implicit $exec, implicit $exec $vgpr0_vgpr1_vgpr2 = COPY killed $vgpr7_vgpr8_vgpr9, implicit $exec ... @@ -403,19 +399,19 @@ liveins: $sgpr0_sgpr1_sgpr2 ; GFX908-LABEL: name: copy_s96_to_v96 ; GFX908: liveins: $sgpr0_sgpr1_sgpr2 - ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX908: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GFX90A-LABEL: name: copy_s96_to_v96 ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2 - ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 - ; GFX90A: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX90A: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX90A: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec + ; GFX90A: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec + ; GFX90A: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec ; GFX10-LABEL: name: copy_s96_to_v96 ; GFX10: liveins: $sgpr0_sgpr1_sgpr2 - ; GFX10: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2, implicit $sgpr0_sgpr1_sgpr2 - ; GFX10: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX10: $vgpr2 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX10: $vgpr0 = V_MOV_B32_e32 $sgpr0, implicit $exec + ; GFX10: $vgpr1 = V_MOV_B32_e32 $sgpr1, implicit $exec + ; GFX10: $vgpr2 = V_MOV_B32_e32 killed $sgpr2, implicit $exec, implicit $exec $vgpr0_vgpr1_vgpr2 = COPY killed $sgpr0_sgpr1_sgpr2, implicit $exec ... @@ -427,18 +423,18 @@ liveins: $sgpr0_sgpr1_sgpr2 ; GFX908-LABEL: name: copy_s96_to_v96_unaligned ; GFX908: liveins: $sgpr0_sgpr1_sgpr2 - ; GFX908: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 - ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX908: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX908: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec + ; GFX908: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec + ; GFX908: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec ; GFX90A-LABEL: name: copy_s96_to_v96_unaligned ; GFX90A: liveins: $sgpr0_sgpr1_sgpr2 - ; GFX90A: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 - ; GFX90A: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX90A: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX90A: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec + ; GFX90A: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec + ; GFX90A: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec ; GFX10-LABEL: name: copy_s96_to_v96_unaligned ; GFX10: liveins: $sgpr0_sgpr1_sgpr2 - ; GFX10: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr1_vgpr2_vgpr3, implicit $sgpr0_sgpr1_sgpr2 - ; GFX10: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2 - ; GFX10: $vgpr1 = V_MOV_B32_e32 $sgpr0, implicit $exec, implicit killed $sgpr0_sgpr1_sgpr2, implicit $exec + ; GFX10: $vgpr3 = V_MOV_B32_e32 $sgpr2, implicit $exec + ; GFX10: $vgpr2 = V_MOV_B32_e32 $sgpr1, implicit $exec + ; GFX10: $vgpr1 = V_MOV_B32_e32 killed $sgpr0, implicit $exec, implicit $exec $vgpr1_vgpr2_vgpr3 = COPY killed $sgpr0_sgpr1_sgpr2, implicit $exec ... diff --git a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll --- a/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/ctlz_zero_undef.ll @@ -96,10 +96,10 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -336,12 +336,12 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v0, v0 ; VI-NEXT: v_add_u32_e32 v0, vcc, -16, v0 ; VI-NEXT: v_add_u16_e32 v2, -8, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -741,10 +741,10 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -821,10 +821,10 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -899,10 +899,10 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ubyte v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbh_u32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_byte v[0:1], v2 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll --- a/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll +++ b/llvm/test/CodeGen/AMDGPU/cttz_zero_undef.ll @@ -93,10 +93,10 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_dword v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_ffbl_b32_e32 v2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -685,8 +685,8 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 1 ; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_load_ubyte v2, v[2:3] @@ -793,16 +793,16 @@ ; VI-NEXT: s_add_u32 s4, s0, 3 ; VI-NEXT: s_addc_u32 s5, s1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_add_u32 s4, s0, 2 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s0, 1 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s1 ; VI-NEXT: v_mov_b32_e32 v6, s0 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] @@ -927,32 +927,32 @@ ; VI-NEXT: s_add_u32 s4, s0, 5 ; VI-NEXT: s_addc_u32 s5, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_add_u32 s4, s0, 4 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_add_u32 s4, s0, 7 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_addc_u32 s5, s1, 0 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_add_u32 s4, s0, 6 +; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v7, s5 ; VI-NEXT: v_mov_b32_e32 v6, s4 ; VI-NEXT: s_add_u32 s4, s0, 3 +; VI-NEXT: v_mov_b32_e32 v7, s5 ; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v9, s5 ; VI-NEXT: v_mov_b32_e32 v8, s4 ; VI-NEXT: s_add_u32 s4, s0, 2 +; VI-NEXT: v_mov_b32_e32 v9, s5 ; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v11, s5 ; VI-NEXT: v_mov_b32_e32 v10, s4 ; VI-NEXT: s_add_u32 s4, s0, 1 +; VI-NEXT: v_mov_b32_e32 v11, s5 ; VI-NEXT: s_addc_u32 s5, s1, 0 ; VI-NEXT: v_mov_b32_e32 v13, s5 -; VI-NEXT: v_mov_b32_e32 v15, s1 ; VI-NEXT: v_mov_b32_e32 v12, s4 +; VI-NEXT: v_mov_b32_e32 v15, s1 ; VI-NEXT: v_mov_b32_e32 v14, s0 ; VI-NEXT: flat_load_ubyte v0, v[0:1] ; VI-NEXT: flat_load_ubyte v2, v[2:3] @@ -985,8 +985,8 @@ ; VI-NEXT: v_ffbl_b32_e32 v0, v2 ; VI-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[2:3] ; VI-NEXT: v_min_u32_e32 v0, v0, v4 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_cndmask_b32_e32 v0, 32, v0, vcc +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -1114,16 +1114,16 @@ ; VI-NEXT: s_add_u32 s4, s0, 3 ; VI-NEXT: s_addc_u32 s5, s1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_add_u32 s4, s0, 2 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s0, 1 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s1 ; VI-NEXT: v_mov_b32_e32 v6, s0 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] @@ -1235,16 +1235,16 @@ ; VI-NEXT: s_add_u32 s4, s0, 3 ; VI-NEXT: s_addc_u32 s5, s1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_add_u32 s4, s0, 2 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s0, 1 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s1 ; VI-NEXT: v_mov_b32_e32 v6, s0 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] @@ -1359,16 +1359,16 @@ ; VI-NEXT: s_add_u32 s4, s0, 3 ; VI-NEXT: s_addc_u32 s5, s1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_add_u32 s4, s0, 2 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s0, 1 -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v7, s1 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v7, s1 ; VI-NEXT: v_mov_b32_e32 v6, s0 ; VI-NEXT: flat_load_ubyte v2, v[2:3] ; VI-NEXT: flat_load_ubyte v3, v[4:5] @@ -1564,8 +1564,8 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 1 ; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_load_ubyte v2, v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll --- a/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll +++ b/llvm/test/CodeGen/AMDGPU/dag-divergence-atomic.ll @@ -399,8 +399,8 @@ ; CHECK-LABEL: fmin: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_atomic_min_f64 v[0:1], v2, v[0:1], s[0:1] glc @@ -424,8 +424,8 @@ ; CHECK-LABEL: fmax: ; CHECK: ; %bb.0: ; CHECK-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 -; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, 0 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0x3ff00000 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: global_atomic_max_f64 v[0:1], v2, v[0:1], s[0:1] glc diff --git a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll --- a/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll +++ b/llvm/test/CodeGen/AMDGPU/ds-sub-offset.ll @@ -85,9 +85,9 @@ ; GFX10-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: ds_write_b32 v2, v3 offset:12 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_fmas_f32 v4, s0, s0, s0 @@ -320,8 +320,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v1, s0 ; GFX9-NEXT: v_div_fmas_f32 v2, v1, v1, v1 -; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: ds_write2_b32 v3, v4, v5 offset1:1 ; GFX9-NEXT: global_store_dword v[0:1], v2, off @@ -335,9 +335,9 @@ ; GFX10-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x7b ; GFX10-NEXT: v_mov_b32_e32 v4, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: v_sub_nc_u32_e32 v2, 0x3fb, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: ds_write2_b32 v2, v3, v4 offset1:1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_div_fmas_f32 v5, s0, s0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/ds_write2.ll b/llvm/test/CodeGen/AMDGPU/ds_write2.ll --- a/llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ b/llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -845,8 +845,8 @@ ; CI-LABEL: store_misaligned64_constant_large_offsets: ; CI: ; %bb.0: ; CI-NEXT: s_mov_b64 s[0:1], 0x7b -; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v2, 0 +; CI-NEXT: v_mov_b32_e32 v0, s0 ; CI-NEXT: v_mov_b32_e32 v1, s1 ; CI-NEXT: s_mov_b32 m0, -1 ; CI-NEXT: ds_write_b64 v2, v[0:1] offset:16384 @@ -856,8 +856,8 @@ ; GFX9-LABEL: store_misaligned64_constant_large_offsets: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_mov_b64 s[0:1], 0x7b -; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:16384 ; GFX9-NEXT: ds_write_b64 v2, v[0:1] offset:32760 @@ -991,8 +991,8 @@ ; GFX9-UNALIGNED-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x0 ; GFX9-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 -; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-UNALIGNED-NEXT: v_mov_b32_e32 v3, s3 ; GFX9-UNALIGNED-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX9-UNALIGNED-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll --- a/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll +++ b/llvm/test/CodeGen/AMDGPU/fast-unaligned-load-store.global.ll @@ -66,13 +66,13 @@ ; GFX7-ALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 1 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-ALIGNED-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-ALIGNED-NEXT: flat_store_short v[0:1], v2 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 2 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-ALIGNED-NEXT: flat_store_short v[0:1], v2 ; GFX7-ALIGNED-NEXT: s_endpgm @@ -82,13 +82,13 @@ ; GFX7-UNALIGNED-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 1 ; GFX7-UNALIGNED-NEXT: s_waitcnt lgkmcnt(0) -; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-UNALIGNED-NEXT: s_add_u32 s2, s0, 2 +; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-UNALIGNED-NEXT: s_addc_u32 s3, s1, 0 ; GFX7-UNALIGNED-NEXT: flat_store_short v[0:1], v2 -; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v2, 2 +; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-UNALIGNED-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-UNALIGNED-NEXT: flat_store_short v[0:1], v2 ; GFX7-UNALIGNED-NEXT: s_endpgm @@ -192,22 +192,22 @@ ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v3, 0 ; GFX7-ALIGNED-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-ALIGNED-NEXT: s_add_u32 s2, s0, 2 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-ALIGNED-NEXT: s_addc_u32 s3, s1, 0 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-ALIGNED-NEXT: s_add_u32 s4, s0, 1 -; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2 ; GFX7-ALIGNED-NEXT: s_addc_u32 s5, s1, 0 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3 +; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s4 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s5 -; GFX7-ALIGNED-NEXT: s_add_u32 s0, s0, 3 -; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3 ; GFX7-ALIGNED-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s0 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s1 ; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v3 -; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v2, 2 +; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-ALIGNED-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-ALIGNED-NEXT: flat_store_byte v[0:1], v2 ; GFX7-ALIGNED-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll --- a/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll +++ b/llvm/test/CodeGen/AMDGPU/fence-lds-read2-write2.ll @@ -9,17 +9,17 @@ define amdgpu_kernel void @same_address_fence_merge_write2() #0 { ; GCN-LABEL: same_address_fence_merge_write2: ; GCN: ; %bb.0: ; %bb -; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: v_lshlrev_b32_e32 v2, 3, v0 ; GCN-NEXT: s_mov_b32 s1, 0x40100000 +; GCN-NEXT: s_mov_b32 s0, 0 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: v_add_u32_e32 v3, 0x800, v2 +; GCN-NEXT: s_mov_b32 s1, 0x3ff00000 ; GCN-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset1:66 ; GCN-NEXT: ds_write2_b64 v2, v[0:1], v[0:1] offset0:132 offset1:198 ; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset0:8 offset1:74 ; GCN-NEXT: ds_write2_b64 v3, v[0:1], v[0:1] offset0:140 offset1:206 -; GCN-NEXT: s_mov_b32 s1, 0x3ff00000 ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 ; GCN-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -54,6 +54,7 @@ ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-PAL-NEXT: s_load_dwordx2 s[2:3], s[2:3], 0x0 ; GFX9-PAL-NEXT: s_mov_b32 s0, 0 +; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: s_mov_b32 vcc_hi, 0 ; GFX9-PAL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-PAL-NEXT: s_and_b32 s3, s3, 0xffff @@ -62,7 +63,6 @@ ; GFX9-PAL-NEXT: s_mov_b32 s1, s0 ; GFX9-PAL-NEXT: s_mov_b32 s2, s0 ; GFX9-PAL-NEXT: s_mov_b32 s3, s0 -; GFX9-PAL-NEXT: v_mov_b32_e32 v0, s0 ; GFX9-PAL-NEXT: v_mov_b32_e32 v1, s1 ; GFX9-PAL-NEXT: v_mov_b32_e32 v2, s2 ; GFX9-PAL-NEXT: v_mov_b32_e32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll b/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll --- a/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll +++ b/llvm/test/CodeGen/AMDGPU/fp-min-max-global-atomics-gfx10.ll @@ -122,8 +122,8 @@ ; G_GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v2, s2 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v2, s2 ; G_GFX10-NEXT: v_mov_b32_e32 v3, s3 ; G_GFX10-NEXT: global_atomic_fmin_x2 v[0:1], v[0:1], v[2:3], off glc ; G_GFX10-NEXT: s_endpgm @@ -148,8 +148,8 @@ ; G_GFX10-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; G_GFX10-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX10-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX10-NEXT: v_mov_b32_e32 v2, s2 ; G_GFX10-NEXT: v_mov_b32_e32 v1, s1 +; G_GFX10-NEXT: v_mov_b32_e32 v2, s2 ; G_GFX10-NEXT: v_mov_b32_e32 v3, s3 ; G_GFX10-NEXT: global_atomic_fmax_x2 v[0:1], v[0:1], v[2:3], off glc ; G_GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll --- a/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll +++ b/llvm/test/CodeGen/AMDGPU/fp64-atomics-gfx90a.ll @@ -446,8 +446,8 @@ ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_agent: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] @@ -494,8 +494,8 @@ ; GFX90A-LABEL: global_atomic_fadd_f64_noret_pat_flush: ; GFX90A: ; %bb.0: ; %main_body ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x24 -; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 +; GFX90A-NEXT: v_mov_b32_e32 v0, 0 ; GFX90A-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX90A-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX90A-NEXT: global_atomic_add_f64 v2, v[0:1], s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/frem.ll b/llvm/test/CodeGen/AMDGPU/frem.ll --- a/llvm/test/CodeGen/AMDGPU/frem.ll +++ b/llvm/test/CodeGen/AMDGPU/frem.ll @@ -2060,8 +2060,8 @@ ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: s_add_u32 s0, s0, 32 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] ; VI-NEXT: flat_load_dwordx2 v[4:5], v[4:5] @@ -2374,8 +2374,8 @@ ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s0, 64 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -2743,8 +2743,8 @@ ; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_add_u32 s0, s0, 64 ; VI-NEXT: s_addc_u32 s1, s1, 0 -; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v1, s7 +; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/fshl.ll b/llvm/test/CodeGen/AMDGPU/fshl.ll --- a/llvm/test/CodeGen/AMDGPU/fshl.ll +++ b/llvm/test/CodeGen/AMDGPU/fshl.ll @@ -112,10 +112,10 @@ ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_alignbit_b32 v2, s0, v0, 25 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -190,6 +190,7 @@ ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: s_not_b32 s1, s1 ; VI-NEXT: s_lshr_b32 s7, s5, 1 @@ -203,7 +204,6 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_alignbit_b32 v0, s1, v0, v2 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -295,12 +295,12 @@ ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, 23 ; VI-NEXT: v_alignbit_b32 v0, s4, v2, 25 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -394,6 +394,7 @@ ; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: s_not_b32 s3, s3 ; VI-NEXT: s_lshr_b32 s11, s7, 1 @@ -419,7 +420,6 @@ ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_alignbit_b32 v0, s1, v0, v4 ; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/fshr.ll b/llvm/test/CodeGen/AMDGPU/fshr.ll --- a/llvm/test/CodeGen/AMDGPU/fshr.ll +++ b/llvm/test/CodeGen/AMDGPU/fshr.ll @@ -107,10 +107,10 @@ ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_alignbit_b32 v2, s0, v0, 7 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; @@ -179,6 +179,7 @@ ; VI-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x34 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v0, s7 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, v1 @@ -186,7 +187,6 @@ ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_alignbit_b32 v0, s4, v0, v2 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -264,12 +264,12 @@ ; VI-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_alignbit_b32 v1, s5, v0, 9 ; VI-NEXT: v_alignbit_b32 v0, s4, v2, 7 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm ; @@ -351,6 +351,7 @@ ; VI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x44 ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x54 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: v_mov_b32_e32 v0, s11 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_alignbit_b32 v3, s7, v0, v1 @@ -364,7 +365,6 @@ ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: v_alignbit_b32 v0, s4, v0, v4 ; VI-NEXT: v_mov_b32_e32 v4, s12 -; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/half.ll b/llvm/test/CodeGen/AMDGPU/half.ll --- a/llvm/test/CodeGen/AMDGPU/half.ll +++ b/llvm/test/CodeGen/AMDGPU/half.ll @@ -64,10 +64,10 @@ ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_u32 s4, s0, 4 ; SI-NEXT: s_addc_u32 s5, s1, 0 -; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: v_mov_b32_e32 v4, s3 -; SI-NEXT: v_mov_b32_e32 v0, s0 +; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: v_mov_b32_e32 v3, s5 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_mov_b32_e32 v5, s2 ; SI-NEXT: flat_store_short v[2:3], v4 @@ -81,10 +81,10 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s0, 4 ; VI-NEXT: s_addc_u32 s5, s1, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v4, s3 -; VI-NEXT: v_mov_b32_e32 v0, s0 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v5, s2 ; VI-NEXT: flat_store_short v[2:3], v4 @@ -103,8 +103,8 @@ ; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s0 -; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: v_mov_b32_e32 v1, s1 +; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: v_mov_b32_e32 v3, s3 ; SI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; SI-NEXT: s_endpgm @@ -115,8 +115,8 @@ ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -131,8 +131,8 @@ ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v4, s6 -; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v5, s7 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: v_mov_b32_e32 v3, s3 @@ -145,8 +145,8 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s6 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v5, s7 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -332,19 +332,19 @@ ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x4 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s4, s1, 16 -; SI-NEXT: s_lshr_b32 s5, s0, 16 ; SI-NEXT: s_lshr_b32 s8, s3, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; SI-NEXT: s_lshr_b32 s4, s2, 16 +; SI-NEXT: s_lshr_b32 s5, s0, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s8 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s3 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s2 -; SI-NEXT: s_add_u32 s0, s6, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; SI-NEXT: s_add_u32 s0, s6, 16 ; SI-NEXT: s_addc_u32 s1, s7, 0 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v9, s1 ; SI-NEXT: v_mov_b32_e32 v8, s0 ; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] @@ -360,19 +360,19 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s4, s1, 16 -; VI-NEXT: s_lshr_b32 s5, s0, 16 ; VI-NEXT: s_lshr_b32 s8, s3, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v3, s4 ; VI-NEXT: s_lshr_b32 s4, s2, 16 +; VI-NEXT: s_lshr_b32 s5, s0, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v7, s8 ; VI-NEXT: v_cvt_f32_f16_e32 v5, s4 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; VI-NEXT: v_cvt_f32_f16_e32 v6, s3 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s2 -; VI-NEXT: s_add_u32 s0, s6, 16 +; VI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s1 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 +; VI-NEXT: s_add_u32 s0, s6, 16 ; VI-NEXT: s_addc_u32 s1, s7, 0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_mov_b32_e32 v8, s0 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] @@ -460,15 +460,15 @@ ; SI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; SI-NEXT: s_lshr_b32 s4, s2, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v0, s3 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: s_add_u32 s2, s0, 16 ; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v0 -; SI-NEXT: s_addc_u32 s3, s1, 0 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: s_addc_u32 s3, s1, 0 ; SI-NEXT: v_mov_b32_e32 v7, s3 ; SI-NEXT: v_mov_b32_e32 v6, s2 ; SI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] @@ -482,15 +482,15 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 ; VI-NEXT: s_lshr_b32 s4, s2, 16 +; VI-NEXT: v_cvt_f32_f16_e32 v1, s3 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v1 -; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v6, s2 ; VI-NEXT: flat_store_dwordx2 v[6:7], v[4:5] @@ -510,17 +510,17 @@ ; SI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x2 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_lshr_b32 s4, s3, 16 +; SI-NEXT: s_lshr_b32 s5, s2, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v4, s4 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s3 -; SI-NEXT: s_lshr_b32 s5, s2, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s5 ; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 ; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 -; SI-NEXT: s_add_u32 s2, s0, 16 -; SI-NEXT: s_addc_u32 s3, s1, 0 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: s_add_u32 s2, s0, 16 +; SI-NEXT: s_addc_u32 s3, s1, 0 ; SI-NEXT: v_mov_b32_e32 v9, s3 ; SI-NEXT: v_mov_b32_e32 v8, s2 ; SI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] @@ -536,17 +536,17 @@ ; VI-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshr_b32 s5, s3, 16 +; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s5 ; VI-NEXT: v_cvt_f32_f16_e32 v5, s3 -; VI-NEXT: s_lshr_b32 s4, s2, 16 ; VI-NEXT: v_cvt_f32_f16_e32 v0, s2 ; VI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 -; VI-NEXT: s_add_u32 s2, s0, 16 -; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; VI-NEXT: s_add_u32 s2, s0, 16 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v9, s3 ; VI-NEXT: v_mov_b32_e32 v8, s2 ; VI-NEXT: flat_store_dwordx4 v[8:9], v[4:7] @@ -571,32 +571,33 @@ ; SI-NEXT: v_cvt_f32_f16_e32 v12, s3 ; SI-NEXT: s_lshr_b32 s5, s2, 16 ; SI-NEXT: s_lshr_b32 s8, s1, 16 -; SI-NEXT: s_lshr_b32 s4, s0, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s2 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s0 -; SI-NEXT: s_add_u32 s0, s6, 48 +; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; SI-NEXT: s_lshr_b32 s4, s0, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v5, s1 +; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: v_cvt_f64_f32_e32 v[14:15], v0 ; SI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; SI-NEXT: s_add_u32 s0, s6, 48 ; SI-NEXT: s_addc_u32 s1, s7, 0 -; SI-NEXT: v_cvt_f32_f16_e32 v4, s8 -; SI-NEXT: v_mov_b32_e32 v17, s1 -; SI-NEXT: v_mov_b32_e32 v16, s0 -; SI-NEXT: s_add_u32 s0, s6, 32 -; SI-NEXT: v_cvt_f32_f16_e32 v2, s4 ; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v1 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 ; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; SI-NEXT: s_addc_u32 s1, s7, 0 -; SI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; SI-NEXT: v_mov_b32_e32 v16, s0 +; SI-NEXT: s_add_u32 s0, s6, 32 ; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v4 -; SI-NEXT: v_mov_b32_e32 v13, s1 ; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v5 +; SI-NEXT: v_mov_b32_e32 v17, s1 +; SI-NEXT: s_addc_u32 s1, s7, 0 +; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; SI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; SI-NEXT: s_nop 0 ; SI-NEXT: v_mov_b32_e32 v12, s0 ; SI-NEXT: s_add_u32 s0, s6, 16 +; SI-NEXT: v_mov_b32_e32 v13, s1 ; SI-NEXT: s_addc_u32 s1, s7, 0 -; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 ; SI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; SI-NEXT: s_nop 0 ; SI-NEXT: v_mov_b32_e32 v9, s1 @@ -613,37 +614,38 @@ ; VI-NEXT: s_load_dwordx2 s[6:7], s[4:5], 0x0 ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: s_lshr_b32 s4, s0, 16 ; VI-NEXT: s_lshr_b32 s8, s2, 16 +; VI-NEXT: s_lshr_b32 s4, s0, 16 ; VI-NEXT: s_lshr_b32 s9, s3, 16 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s4 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s8 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s4 ; VI-NEXT: v_cvt_f32_f16_e32 v5, s9 ; VI-NEXT: v_cvt_f32_f16_e32 v12, s3 ; VI-NEXT: s_lshr_b32 s5, s1, 16 -; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 -; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; VI-NEXT: v_cvt_f32_f16_e32 v8, s2 -; VI-NEXT: s_add_u32 s0, s6, 48 +; VI-NEXT: v_cvt_f32_f16_e32 v1, s5 ; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v4 -; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v5 ; VI-NEXT: v_cvt_f32_f16_e32 v4, s1 +; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v0 +; VI-NEXT: v_cvt_f64_f32_e32 v[14:15], v5 +; VI-NEXT: v_cvt_f32_f16_e32 v0, s0 ; VI-NEXT: v_cvt_f64_f32_e32 v[12:13], v12 +; VI-NEXT: s_add_u32 s0, s6, 48 ; VI-NEXT: s_addc_u32 s1, s7, 0 -; VI-NEXT: v_cvt_f32_f16_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v17, s1 +; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; VI-NEXT: v_mov_b32_e32 v16, s0 ; VI-NEXT: s_add_u32 s0, s6, 32 -; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 -; VI-NEXT: s_addc_u32 s1, s7, 0 -; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v1 -; VI-NEXT: v_mov_b32_e32 v13, s1 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v4 +; VI-NEXT: v_mov_b32_e32 v17, s1 +; VI-NEXT: s_addc_u32 s1, s7, 0 +; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; VI-NEXT: flat_store_dwordx4 v[16:17], v[12:15] +; VI-NEXT: s_nop 0 ; VI-NEXT: v_mov_b32_e32 v12, s0 ; VI-NEXT: s_add_u32 s0, s6, 16 +; VI-NEXT: v_mov_b32_e32 v13, s1 ; VI-NEXT: s_addc_u32 s1, s7, 0 -; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 ; VI-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; VI-NEXT: s_nop 0 ; VI-NEXT: v_mov_b32_e32 v9, s1 @@ -937,10 +939,10 @@ ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_u32 s4, s2, 16 -; SI-NEXT: v_mov_b32_e32 v5, s3 ; SI-NEXT: s_addc_u32 s5, s3, 0 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v5, s3 ; SI-NEXT: v_mov_b32_e32 v4, s2 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; SI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -950,6 +952,7 @@ ; SI-NEXT: v_mov_b32_e32 v13, s2 ; SI-NEXT: s_add_u32 s2, s0, 48 ; SI-NEXT: s_addc_u32 s3, s1, 0 +; SI-NEXT: v_mov_b32_e32 v15, s3 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_cvt_f32_f16_e32 v8, v1 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -972,19 +975,18 @@ ; SI-NEXT: v_lshrrev_b32_e32 v11, 16, v2 ; SI-NEXT: v_cvt_f32_f16_e32 v2, v5 ; SI-NEXT: v_cvt_f32_f16_e32 v0, v4 -; SI-NEXT: v_mov_b32_e32 v5, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v9, v1 ; SI-NEXT: v_cvt_f32_f16_e32 v13, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v16 ; SI-NEXT: v_cvt_f32_f16_e32 v1, v17 +; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 ; SI-NEXT: v_mov_b32_e32 v4, s0 ; SI-NEXT: s_add_u32 s0, s0, 32 -; SI-NEXT: v_cvt_f32_f16_e32 v11, v11 -; SI-NEXT: s_addc_u32 s1, s1, 0 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 -; SI-NEXT: v_mov_b32_e32 v15, s3 -; SI-NEXT: v_mov_b32_e32 v17, s1 +; SI-NEXT: v_mov_b32_e32 v5, s1 +; SI-NEXT: s_addc_u32 s1, s1, 0 ; SI-NEXT: v_mov_b32_e32 v14, s2 +; SI-NEXT: v_mov_b32_e32 v17, s1 ; SI-NEXT: v_mov_b32_e32 v16, s0 ; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; SI-NEXT: flat_store_dwordx4 v[14:15], v[10:13] @@ -996,8 +998,8 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_add_u32 s2, s2, 16 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 @@ -1008,10 +1010,10 @@ ; VI-NEXT: v_mov_b32_e32 v19, s3 ; VI-NEXT: v_mov_b32_e32 v18, s2 ; VI-NEXT: s_add_u32 s2, s0, 48 -; VI-NEXT: v_mov_b32_e32 v17, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v16, s0 ; VI-NEXT: s_add_u32 s0, s0, 32 +; VI-NEXT: v_mov_b32_e32 v17, s1 ; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v21, s3 ; VI-NEXT: v_mov_b32_e32 v20, s2 @@ -1143,6 +1145,7 @@ ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v9, s3 ; VI-NEXT: v_mov_b32_e32 v5, s1 ; VI-NEXT: v_mov_b32_e32 v4, s0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1152,7 +1155,6 @@ ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v3 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 -; VI-NEXT: v_mov_b32_e32 v9, s3 ; VI-NEXT: v_mov_b32_e32 v8, s2 ; VI-NEXT: flat_store_dwordx2 v[8:9], v[6:7] ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] @@ -1173,6 +1175,7 @@ ; SI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; SI-NEXT: s_add_u32 s2, s0, 16 ; SI-NEXT: s_addc_u32 s3, s1, 0 +; SI-NEXT: v_mov_b32_e32 v11, s3 ; SI-NEXT: v_mov_b32_e32 v9, s1 ; SI-NEXT: v_mov_b32_e32 v8, s0 ; SI-NEXT: s_waitcnt vmcnt(0) @@ -1186,7 +1189,6 @@ ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v1 ; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v2 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10 -; SI-NEXT: v_mov_b32_e32 v11, s3 ; SI-NEXT: v_mov_b32_e32 v10, s2 ; SI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; SI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -1201,6 +1203,7 @@ ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: s_add_u32 s2, s0, 16 ; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_mov_b32_e32 v11, s3 ; VI-NEXT: v_mov_b32_e32 v9, s1 ; VI-NEXT: v_mov_b32_e32 v8, s0 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -1212,7 +1215,6 @@ ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v6 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v2 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v10 -; VI-NEXT: v_mov_b32_e32 v11, s3 ; VI-NEXT: v_mov_b32_e32 v10, s2 ; VI-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; VI-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -1236,13 +1238,13 @@ ; SI-NEXT: v_mov_b32_e32 v7, s3 ; SI-NEXT: v_mov_b32_e32 v6, s2 ; SI-NEXT: s_add_u32 s2, s0, 32 -; SI-NEXT: v_mov_b32_e32 v13, s1 ; SI-NEXT: s_addc_u32 s3, s1, 0 ; SI-NEXT: v_mov_b32_e32 v12, s0 ; SI-NEXT: s_add_u32 s0, s0, 16 +; SI-NEXT: v_mov_b32_e32 v13, s1 ; SI-NEXT: v_mov_b32_e32 v15, s3 -; SI-NEXT: s_addc_u32 s1, s1, 0 ; SI-NEXT: v_mov_b32_e32 v14, s2 +; SI-NEXT: s_addc_u32 s1, s1, 0 ; SI-NEXT: s_waitcnt vmcnt(0) ; SI-NEXT: v_lshrrev_b32_e32 v4, 16, v3 ; SI-NEXT: v_cvt_f32_f16_e32 v3, v3 @@ -1285,13 +1287,13 @@ ; VI-NEXT: v_mov_b32_e32 v8, s3 ; VI-NEXT: v_mov_b32_e32 v7, s2 ; VI-NEXT: s_add_u32 s2, s0, 32 -; VI-NEXT: v_mov_b32_e32 v13, s1 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v12, s0 ; VI-NEXT: s_add_u32 s0, s0, 16 +; VI-NEXT: v_mov_b32_e32 v13, s1 ; VI-NEXT: v_mov_b32_e32 v15, s3 -; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v14, s2 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_cvt_f32_f16_e32 v9, v0 ; VI-NEXT: v_cvt_f32_f16_sdwa v16, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -1328,8 +1330,8 @@ ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v0, s2 -; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_add_u32 s2, s2, 16 +; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: s_addc_u32 s3, s3, 0 ; SI-NEXT: v_mov_b32_e32 v2, s2 ; SI-NEXT: v_mov_b32_e32 v3, s3 @@ -1347,10 +1349,10 @@ ; SI-NEXT: s_addc_u32 s3, s1, 0 ; SI-NEXT: v_mov_b32_e32 v18, s3 ; SI-NEXT: v_mov_b32_e32 v17, s2 -; SI-NEXT: s_add_u32 s2, s0, 0x70 -; SI-NEXT: s_addc_u32 s3, s1, 0 ; SI-NEXT: v_mov_b32_e32 v12, s1 ; SI-NEXT: v_mov_b32_e32 v11, s0 +; SI-NEXT: s_add_u32 s2, s0, 0x70 +; SI-NEXT: s_addc_u32 s3, s1, 0 ; SI-NEXT: s_waitcnt vmcnt(1) ; SI-NEXT: v_lshrrev_b32_e32 v8, 16, v7 ; SI-NEXT: v_cvt_f32_f16_e32 v7, v7 @@ -1388,32 +1390,33 @@ ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v9 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 ; SI-NEXT: v_cvt_f32_f16_e32 v8, v10 -; SI-NEXT: v_mov_b32_e32 v14, s3 -; SI-NEXT: v_mov_b32_e32 v13, s2 -; SI-NEXT: s_add_u32 s2, s0, 0x60 ; SI-NEXT: v_cvt_f32_f16_e32 v10, v4 -; SI-NEXT: s_addc_u32 s3, s1, 0 ; SI-NEXT: flat_store_dwordx4 v[11:12], v[0:3] -; SI-NEXT: v_cvt_f32_f16_e32 v12, v5 +; SI-NEXT: s_nop 0 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v19 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v8 -; SI-NEXT: v_mov_b32_e32 v16, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v12, v5 +; SI-NEXT: v_mov_b32_e32 v13, s2 +; SI-NEXT: s_add_u32 s2, s0, 0x60 ; SI-NEXT: v_cvt_f32_f16_e32 v19, v20 +; SI-NEXT: v_mov_b32_e32 v14, s3 +; SI-NEXT: s_addc_u32 s3, s1, 0 ; SI-NEXT: v_mov_b32_e32 v15, s2 ; SI-NEXT: s_add_u32 s2, s0, 0x50 -; SI-NEXT: s_addc_u32 s3, s1, 0 ; SI-NEXT: v_cvt_f64_f32_e32 v[8:9], v6 ; SI-NEXT: v_cvt_f64_f32_e32 v[10:11], v10 -; SI-NEXT: s_add_u32 s0, s0, 64 +; SI-NEXT: v_mov_b32_e32 v16, s3 +; SI-NEXT: s_addc_u32 s3, s1, 0 ; SI-NEXT: flat_store_dwordx4 v[13:14], v[0:3] -; SI-NEXT: s_addc_u32 s1, s1, 0 +; SI-NEXT: s_add_u32 s0, s0, 64 ; SI-NEXT: v_cvt_f64_f32_e32 v[0:1], v7 ; SI-NEXT: v_cvt_f64_f32_e32 v[2:3], v12 ; SI-NEXT: v_cvt_f64_f32_e32 v[4:5], v21 ; SI-NEXT: v_cvt_f64_f32_e32 v[6:7], v19 +; SI-NEXT: s_addc_u32 s1, s1, 0 ; SI-NEXT: v_mov_b32_e32 v18, s3 -; SI-NEXT: v_mov_b32_e32 v13, s1 ; SI-NEXT: v_mov_b32_e32 v17, s2 +; SI-NEXT: v_mov_b32_e32 v13, s1 ; SI-NEXT: v_mov_b32_e32 v12, s0 ; SI-NEXT: flat_store_dwordx4 v[15:16], v[8:11] ; SI-NEXT: flat_store_dwordx4 v[17:18], v[0:3] @@ -1425,8 +1428,8 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_add_u32 s2, s2, 16 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -1444,10 +1447,10 @@ ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v18, s3 ; VI-NEXT: v_mov_b32_e32 v17, s2 -; VI-NEXT: s_add_u32 s2, s0, 0x70 ; VI-NEXT: v_mov_b32_e32 v12, s1 -; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v11, s0 +; VI-NEXT: s_add_u32 s2, s0, 0x70 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_f16_e32 v8, v7 ; VI-NEXT: v_cvt_f32_f16_sdwa v9, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 @@ -1459,7 +1462,7 @@ ; VI-NEXT: v_cvt_f32_f16_sdwa v8, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: s_waitcnt vmcnt(1) ; VI-NEXT: v_cvt_f32_f16_e32 v10, v0 -; VI-NEXT: v_mov_b32_e32 v14, s3 +; VI-NEXT: v_mov_b32_e32 v13, s2 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v7 ; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; VI-NEXT: flat_store_dwordx4 v[15:16], v[6:9] @@ -1477,32 +1480,32 @@ ; VI-NEXT: v_cvt_f32_f16_sdwa v8, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_sdwa v17, v0 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v0, v3 -; VI-NEXT: v_mov_b32_e32 v13, s2 -; VI-NEXT: s_add_u32 s2, s0, 0x60 ; VI-NEXT: flat_store_dwordx4 v[11:12], v[4:7] -; VI-NEXT: s_addc_u32 s3, s1, 0 +; VI-NEXT: v_cvt_f32_f16_sdwa v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f64_f32_e32 v[5:6], v8 ; VI-NEXT: v_cvt_f32_f16_e32 v8, v2 ; VI-NEXT: v_cvt_f32_f16_sdwa v2, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f32_f16_e32 v7, v1 -; VI-NEXT: v_cvt_f32_f16_sdwa v12, v1 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:WORD_1 ; VI-NEXT: v_cvt_f64_f32_e32 v[3:4], v0 -; VI-NEXT: v_mov_b32_e32 v16, s3 +; VI-NEXT: s_add_u32 s2, s0, 0x60 +; VI-NEXT: v_mov_b32_e32 v14, s3 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v15, s2 ; VI-NEXT: s_add_u32 s2, s0, 0x50 -; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_cvt_f64_f32_e32 v[0:1], v10 ; VI-NEXT: v_cvt_f64_f32_e32 v[8:9], v8 ; VI-NEXT: v_cvt_f64_f32_e32 v[10:11], v2 -; VI-NEXT: s_add_u32 s0, s0, 64 +; VI-NEXT: v_mov_b32_e32 v16, s3 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: flat_store_dwordx4 v[13:14], v[3:6] -; VI-NEXT: s_addc_u32 s1, s1, 0 +; VI-NEXT: s_add_u32 s0, s0, 64 ; VI-NEXT: v_cvt_f64_f32_e32 v[4:5], v7 ; VI-NEXT: v_cvt_f64_f32_e32 v[6:7], v12 ; VI-NEXT: v_cvt_f64_f32_e32 v[2:3], v17 +; VI-NEXT: s_addc_u32 s1, s1, 0 ; VI-NEXT: v_mov_b32_e32 v20, s3 -; VI-NEXT: v_mov_b32_e32 v13, s1 ; VI-NEXT: v_mov_b32_e32 v19, s2 +; VI-NEXT: v_mov_b32_e32 v13, s1 ; VI-NEXT: v_mov_b32_e32 v12, s0 ; VI-NEXT: flat_store_dwordx4 v[15:16], v[8:11] ; VI-NEXT: flat_store_dwordx4 v[19:20], v[4:7] @@ -1591,8 +1594,8 @@ ; SI-NEXT: v_mov_b32_e32 v1, s3 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: flat_store_short v[0:1], v2 -; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_or_b32_e32 v2, v4, v3 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: flat_store_dword v[0:1], v2 ; SI-NEXT: s_endpgm @@ -1607,14 +1610,14 @@ ; VI-NEXT: s_add_u32 s2, s0, 4 ; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) -; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; VI-NEXT: v_cvt_f16_f32_sdwa v3, v1 dst_sel:WORD_1 dst_unused:UNUSED_PAD src0_sel:DWORD ; VI-NEXT: v_cvt_f16_f32_e32 v4, v0 +; VI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_or_b32_e32 v3, v4, v3 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_or_b32_e32 v3, v4, v3 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_dword v[0:1], v3 ; VI-NEXT: s_endpgm @@ -1676,10 +1679,10 @@ ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_u32 s4, s2, 16 -; SI-NEXT: v_mov_b32_e32 v5, s3 ; SI-NEXT: s_addc_u32 s5, s3, 0 -; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v5, s3 ; SI-NEXT: v_mov_b32_e32 v4, s2 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; SI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -1711,10 +1714,10 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 16 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s4 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; VI-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -1750,13 +1753,13 @@ ; SI-NEXT: s_add_u32 s4, s2, 32 ; SI-NEXT: s_addc_u32 s5, s3, 0 ; SI-NEXT: v_mov_b32_e32 v0, s4 -; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_add_u32 s4, s2, 48 -; SI-NEXT: v_mov_b32_e32 v13, s3 +; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: s_addc_u32 s5, s3, 0 ; SI-NEXT: v_mov_b32_e32 v12, s2 -; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: s_add_u32 s2, s2, 16 +; SI-NEXT: v_mov_b32_e32 v13, s3 +; SI-NEXT: v_mov_b32_e32 v4, s4 ; SI-NEXT: v_mov_b32_e32 v5, s5 ; SI-NEXT: s_addc_u32 s3, s3, 0 ; SI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] @@ -1787,11 +1790,11 @@ ; SI-NEXT: v_cvt_f16_f32_e32 v10, v10 ; SI-NEXT: v_cvt_f16_f32_e32 v8, v8 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; SI-NEXT: v_mov_b32_e32 v5, s3 ; SI-NEXT: v_lshlrev_b32_e32 v18, 16, v1 ; SI-NEXT: v_or_b32_e32 v1, v2, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v7 ; SI-NEXT: v_lshlrev_b32_e32 v7, 16, v16 +; SI-NEXT: v_mov_b32_e32 v5, s3 ; SI-NEXT: v_mov_b32_e32 v4, s2 ; SI-NEXT: v_or_b32_e32 v0, v0, v18 ; SI-NEXT: v_or_b32_e32 v3, v6, v2 @@ -1817,13 +1820,13 @@ ; VI-NEXT: s_add_u32 s4, s2, 32 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_add_u32 s4, s2, 48 -; VI-NEXT: v_mov_b32_e32 v13, s3 +; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v12, s2 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: s_add_u32 s2, s2, 16 +; VI-NEXT: v_mov_b32_e32 v13, s3 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: s_addc_u32 s3, s3, 0 ; VI-NEXT: flat_load_dwordx4 v[0:3], v[0:1] @@ -1897,11 +1900,11 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; VI-NEXT: s_load_dword s2, s[4:5], 0x8 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_lshr_b32 s3, s2, 16 ; VI-NEXT: v_mov_b32_e32 v0, s3 ; VI-NEXT: v_add_f16_e32 v2, s2, v0 ; VI-NEXT: v_mov_b32_e32 v0, s0 -; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm %add = fadd half %a, %b @@ -1963,10 +1966,10 @@ ; SI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_u32 s4, s2, 8 -; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: s_addc_u32 s5, s3, 0 -; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v0, s2 ; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: v_mov_b32_e32 v2, s4 ; SI-NEXT: v_mov_b32_e32 v3, s5 ; SI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; SI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] @@ -2006,10 +2009,10 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 8 -; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: flat_load_dwordx2 v[2:3], v[2:3] @@ -2044,50 +2047,50 @@ ; SI-NEXT: s_lshr_b32 s0, s4, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v8, s0 ; SI-NEXT: s_lshr_b32 s0, s5, 16 -; SI-NEXT: s_lshr_b32 s11, s1, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v0, s10 ; SI-NEXT: s_lshr_b32 s10, s2, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v9, s0 ; SI-NEXT: s_lshr_b32 s0, s6, 16 -; SI-NEXT: v_cvt_f32_f16_e32 v1, s11 +; SI-NEXT: s_lshr_b32 s11, s1, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v2, s10 ; SI-NEXT: s_lshr_b32 s10, s3, 16 ; SI-NEXT: v_cvt_f32_f16_e32 v10, s0 ; SI-NEXT: s_lshr_b32 s0, s7, 16 +; SI-NEXT: v_cvt_f32_f16_e32 v1, s11 ; SI-NEXT: v_cvt_f32_f16_e32 v3, s10 -; SI-NEXT: v_cvt_f32_f16_e32 v5, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v11, s0 -; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 -; SI-NEXT: v_cvt_f32_f16_e32 v13, s5 +; SI-NEXT: v_cvt_f32_f16_e32 v5, s1 ; SI-NEXT: v_cvt_f32_f16_e32 v6, s2 ; SI-NEXT: v_cvt_f32_f16_e32 v7, s3 +; SI-NEXT: v_cvt_f32_f16_e32 v12, s4 +; SI-NEXT: v_cvt_f32_f16_e32 v13, s5 ; SI-NEXT: v_cvt_f32_f16_e32 v14, s7 ; SI-NEXT: v_cvt_f32_f16_e32 v15, s6 -; SI-NEXT: v_add_f32_e32 v1, v1, v9 -; SI-NEXT: v_add_f32_e32 v0, v0, v8 ; SI-NEXT: v_add_f32_e32 v3, v3, v11 ; SI-NEXT: v_add_f32_e32 v2, v2, v10 -; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 -; SI-NEXT: v_add_f32_e32 v5, v5, v13 -; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 -; SI-NEXT: v_add_f32_e32 v4, v4, v12 +; SI-NEXT: v_add_f32_e32 v1, v1, v9 +; SI-NEXT: v_add_f32_e32 v0, v0, v8 ; SI-NEXT: v_cvt_f16_f32_e32 v3, v3 ; SI-NEXT: v_add_f32_e32 v7, v7, v14 ; SI-NEXT: v_cvt_f16_f32_e32 v2, v2 ; SI-NEXT: v_add_f32_e32 v6, v6, v15 -; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 -; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 +; SI-NEXT: v_cvt_f16_f32_e32 v1, v1 +; SI-NEXT: v_add_f32_e32 v5, v5, v13 +; SI-NEXT: v_cvt_f16_f32_e32 v0, v0 +; SI-NEXT: v_add_f32_e32 v4, v4, v12 ; SI-NEXT: v_cvt_f16_f32_e32 v7, v7 ; SI-NEXT: v_cvt_f16_f32_e32 v6, v6 -; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 -; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_cvt_f16_f32_e32 v5, v5 +; SI-NEXT: v_cvt_f16_f32_e32 v4, v4 ; SI-NEXT: v_lshlrev_b32_e32 v3, 16, v3 ; SI-NEXT: v_lshlrev_b32_e32 v2, 16, v2 +; SI-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; SI-NEXT: v_lshlrev_b32_e32 v0, 16, v0 +; SI-NEXT: v_or_b32_e32 v3, v7, v3 +; SI-NEXT: v_or_b32_e32 v2, v6, v2 ; SI-NEXT: v_or_b32_e32 v1, v5, v1 ; SI-NEXT: v_or_b32_e32 v0, v4, v0 ; SI-NEXT: v_mov_b32_e32 v4, s8 -; SI-NEXT: v_or_b32_e32 v3, v7, v3 -; SI-NEXT: v_or_b32_e32 v2, v6, v2 ; SI-NEXT: v_mov_b32_e32 v5, s9 ; SI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; SI-NEXT: s_endpgm @@ -2098,6 +2101,7 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x10 ; VI-NEXT: s_load_dwordx4 s[4:7], s[4:5], 0x20 ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v5, s9 ; VI-NEXT: s_lshr_b32 s11, s3, 16 ; VI-NEXT: s_lshr_b32 s10, s7, 16 ; VI-NEXT: v_mov_b32_e32 v0, s10 @@ -2131,7 +2135,6 @@ ; VI-NEXT: v_add_f16_e32 v4, s0, v4 ; VI-NEXT: v_or_b32_e32 v0, v4, v0 ; VI-NEXT: v_mov_b32_e32 v4, s8 -; VI-NEXT: v_mov_b32_e32 v5, s9 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm %add = fadd <8 x half> %a, %b diff --git a/llvm/test/CodeGen/AMDGPU/idot4s.ll b/llvm/test/CodeGen/AMDGPU/idot4s.ll --- a/llvm/test/CodeGen/AMDGPU/idot4s.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -447,9 +447,9 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_ubyte v5, v[2:3] diff --git a/llvm/test/CodeGen/AMDGPU/idot4u.ll b/llvm/test/CodeGen/AMDGPU/idot4u.ll --- a/llvm/test/CodeGen/AMDGPU/idot4u.ll +++ b/llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -430,9 +430,9 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_ubyte v5, v[2:3] @@ -607,9 +607,9 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_ubyte v5, v[2:3] @@ -754,9 +754,9 @@ ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s4, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_mov_b32_e32 v3, s1 -; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; GFX8-NEXT: flat_load_ubyte v5, v[2:3] @@ -1518,8 +1518,8 @@ ; GFX8-NEXT: v_mov_b32_e32 v1, s7 ; GFX8-NEXT: v_add_u32_e32 v0, vcc, s6, v2 ; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc -; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v3, s1 ; GFX8-NEXT: v_mov_b32_e32 v2, s0 ; GFX8-NEXT: flat_load_ushort v10, v[2:3] ; GFX8-NEXT: s_waitcnt vmcnt(2) diff --git a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll --- a/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll +++ b/llvm/test/CodeGen/AMDGPU/insert_vector_elt.ll @@ -168,8 +168,8 @@ ; SI-NEXT: s_mov_b32 s5, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 -; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -183,8 +183,8 @@ ; VI-NEXT: s_mov_b32 s5, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 -; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 @@ -203,9 +203,9 @@ ; SI-NEXT: s_mov_b32 s6, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 -; SI-NEXT: v_mov_b32_e32 v2, s6 ; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -218,9 +218,9 @@ ; VI-NEXT: s_mov_b32 s6, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 -; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm @@ -238,10 +238,10 @@ ; SI-NEXT: s_mov_b32 s7, 0x40a00000 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_mov_b32_e32 v2, s6 -; SI-NEXT: v_mov_b32_e32 v3, s7 ; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; SI-NEXT: s_endpgm ; @@ -253,10 +253,10 @@ ; VI-NEXT: s_mov_b32 s7, 0x40a00000 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 -; VI-NEXT: v_mov_b32_e32 v3, s7 ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; VI-NEXT: s_endpgm %vecins = insertelement <4 x float> %a, float 5.000000e+00, i32 3 @@ -1547,10 +1547,10 @@ ; SI-NEXT: BB30_3: ; %if ; SI-NEXT: s_load_dword s7, s[2:3], 0x0 ; SI-NEXT: BB30_4: ; %endif -; SI-NEXT: s_waitcnt lgkmcnt(0) -; SI-NEXT: v_mov_b32_e32 v0, s6 ; SI-NEXT: s_mov_b32 s3, 0x100f000 ; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s6 +; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: v_mov_b32_e32 v1, s7 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm @@ -1571,10 +1571,10 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_load_dword s7, s[2:3], 0x0 ; VI-NEXT: BB30_4: ; %endif -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v0, s6 ; VI-NEXT: s_mov_b32 s3, 0x1100f000 ; VI-NEXT: s_mov_b32 s2, -1 +; VI-NEXT: v_mov_b32_e32 v0, s6 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v1, s7 ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; VI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/kernel-args.ll b/llvm/test/CodeGen/AMDGPU/kernel-args.ll --- a/llvm/test/CodeGen/AMDGPU/kernel-args.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-args.ll @@ -764,8 +764,8 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -828,8 +828,8 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -1034,10 +1034,10 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s4, s2, 4 ; VI-NEXT: s_addc_u32 s5, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v4, s1 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s4 ; VI-NEXT: v_mov_b32_e32 v3, s5 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v5, s0 ; VI-NEXT: flat_store_short v[2:3], v4 @@ -1412,8 +1412,8 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -1566,8 +1566,8 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -1640,8 +1640,8 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -1716,11 +1716,11 @@ ; VI-NEXT: s_load_dword s1, s[0:1], 0x30 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s2, 4 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_byte v[2:3], v4 ; VI-NEXT: v_mov_b32_e32 v2, s4 @@ -1861,14 +1861,14 @@ ; VI-NEXT: s_load_dword s1, s[0:1], 0x3c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s2, 8 -; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: v_mov_b32_e32 v4, s1 ; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v3, s1 -; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_short v[2:3], v4 ; VI-NEXT: v_mov_b32_e32 v2, s4 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v3, s5 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -2095,14 +2095,14 @@ ; VI-NEXT: s_load_dword s1, s[0:1], 0x54 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s2, 16 -; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v2, s1 ; VI-NEXT: s_addc_u32 s1, s3, 0 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 @@ -2193,16 +2193,16 @@ ; VI-NEXT: s_load_dword s1, s[0:1], 0x54 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_add_u32 s0, s2, 16 -; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: s_addc_u32 s1, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s1 -; VI-NEXT: v_mov_b32_e32 v1, s0 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: flat_store_dword v[1:2], v3 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_endpgm @@ -2302,23 +2302,24 @@ ; VI-NEXT: s_add_u32 s8, s2, 16 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_addc_u32 s9, s3, 0 -; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v5, s9 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_add_u32 s2, s2, 32 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_nop 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -2445,23 +2446,24 @@ ; VI-NEXT: s_add_u32 s8, s2, 16 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_addc_u32 s9, s3, 0 -; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v5, s9 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: v_mov_b32_e32 v4, s2 +; VI-NEXT: s_add_u32 s2, s2, 32 ; VI-NEXT: v_mov_b32_e32 v0, s4 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 -; VI-NEXT: v_mov_b32_e32 v4, s2 -; VI-NEXT: s_add_u32 s2, s2, 32 -; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v2, s2 +; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] +; VI-NEXT: s_nop 0 ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 +; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; VI-NEXT: s_endpgm @@ -2572,8 +2574,8 @@ ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -2826,8 +2828,8 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -3083,9 +3085,9 @@ ; VI-NEXT: s_add_u32 s4, s8, 16 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s9, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v4, s8 @@ -3194,9 +3196,9 @@ ; VI-NEXT: s_add_u32 s4, s8, 16 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s9, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v4, s8 @@ -3297,8 +3299,8 @@ ; VI-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v4, s4 -; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v5, s5 +; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 @@ -3756,9 +3758,9 @@ ; VI-NEXT: s_add_u32 s4, s8, 16 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s9, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v4, s8 @@ -4224,9 +4226,9 @@ ; VI-NEXT: s_add_u32 s12, s16, 48 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: s_addc_u32 s13, s17, 0 -; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_nop 0 @@ -4234,9 +4236,9 @@ ; VI-NEXT: s_add_u32 s8, s16, 32 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_addc_u32 s9, s17, 0 -; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v5, s9 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_nop 0 @@ -4244,9 +4246,9 @@ ; VI-NEXT: s_add_u32 s4, s16, 16 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s17, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v4, s16 @@ -4414,9 +4416,9 @@ ; VI-NEXT: s_add_u32 s12, s16, 48 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: s_addc_u32 s13, s17, 0 -; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_nop 0 @@ -4424,9 +4426,9 @@ ; VI-NEXT: s_add_u32 s8, s16, 32 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_addc_u32 s9, s17, 0 -; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v5, s9 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_nop 0 @@ -4434,9 +4436,9 @@ ; VI-NEXT: s_add_u32 s4, s16, 16 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s17, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: v_mov_b32_e32 v4, s16 @@ -5149,8 +5151,8 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_bfe_i64 s[0:1], s[0:1], 0x10000 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -5412,17 +5414,17 @@ ; VI-NEXT: s_add_u32 s4, s0, 50 ; VI-NEXT: s_addc_u32 s5, s1, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_add_u32 s2, s2, 3 +; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_addc_u32 s3, s3, 0 -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: s_add_u32 s2, s0, 51 -; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v5, s3 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v6, s2 +; VI-NEXT: v_mov_b32_e32 v7, s3 ; VI-NEXT: s_load_dword s4, s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x28 ; VI-NEXT: flat_load_ubyte v8, v[0:1] @@ -5435,10 +5437,10 @@ ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: flat_load_dwordx2 v[0:1], v[0:1] ; VI-NEXT: v_mov_b32_e32 v2, 0 -; VI-NEXT: s_waitcnt lgkmcnt(0) -; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v3, 0 +; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v7, s4 +; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_store_dword v[2:3], v7 ; VI-NEXT: s_waitcnt vmcnt(0) @@ -5871,11 +5873,11 @@ ; VI-NEXT: s_add_u32 s4, s2, 2 ; VI-NEXT: s_addc_u32 s5, s3, 0 ; VI-NEXT: v_mov_b32_e32 v2, s2 -; VI-NEXT: v_mov_b32_e32 v3, s3 ; VI-NEXT: s_add_u32 s2, s0, 42 ; VI-NEXT: v_mov_b32_e32 v0, s4 -; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v1, s5 +; VI-NEXT: v_mov_b32_e32 v3, s3 +; VI-NEXT: s_addc_u32 s3, s1, 0 ; VI-NEXT: v_mov_b32_e32 v5, s3 ; VI-NEXT: v_mov_b32_e32 v4, s2 ; VI-NEXT: flat_load_ushort v0, v[0:1] @@ -6202,9 +6204,9 @@ ; VI-NEXT: s_add_u32 s12, s16, 48 ; VI-NEXT: v_mov_b32_e32 v1, s13 ; VI-NEXT: s_addc_u32 s13, s17, 0 -; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v2, s14 ; VI-NEXT: v_mov_b32_e32 v3, s15 +; VI-NEXT: v_mov_b32_e32 v4, s12 ; VI-NEXT: v_mov_b32_e32 v5, s13 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6212,9 +6214,9 @@ ; VI-NEXT: s_add_u32 s8, s16, 32 ; VI-NEXT: v_mov_b32_e32 v1, s9 ; VI-NEXT: s_addc_u32 s9, s17, 0 -; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v2, s10 ; VI-NEXT: v_mov_b32_e32 v3, s11 +; VI-NEXT: v_mov_b32_e32 v4, s8 ; VI-NEXT: v_mov_b32_e32 v5, s9 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) @@ -6222,9 +6224,9 @@ ; VI-NEXT: s_add_u32 s4, s16, 16 ; VI-NEXT: v_mov_b32_e32 v1, s5 ; VI-NEXT: s_addc_u32 s5, s17, 0 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v2, s6 ; VI-NEXT: v_mov_b32_e32 v3, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; VI-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll --- a/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll +++ b/llvm/test/CodeGen/AMDGPU/lds-atomic-fmin-fmax.ll @@ -615,8 +615,8 @@ ; SI-NEXT: s_add_i32 s5, s1, 64 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s1, 0x40450000 -; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] ; SI-NEXT: v_mov_b32_e32 v4, s5 @@ -648,8 +648,8 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshl_b32 s1, s0, 4 ; GFX7-NEXT: s_lshl_b32 s0, s0, 3 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32 @@ -681,8 +681,8 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s1, s0, 4 ; VI-NEXT: s_lshl_b32 s0, s0, 3 -; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] offset:32 @@ -776,8 +776,8 @@ ; G_SI-NEXT: s_add_i32 s2, s2, 4 ; G_SI-NEXT: s_lshl_b32 s5, s2, 3 ; G_SI-NEXT: s_mov_b32 s1, 0x40450000 -; G_SI-NEXT: v_mov_b32_e32 v0, s0 ; G_SI-NEXT: v_mov_b32_e32 v2, s5 +; G_SI-NEXT: v_mov_b32_e32 v0, s0 ; G_SI-NEXT: v_mov_b32_e32 v1, s1 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] @@ -806,13 +806,13 @@ ; G_GFX7-NEXT: s_mov_b32 s11, 0xe8f000 ; G_GFX7-NEXT: s_add_u32 s8, s8, s3 ; G_GFX7-NEXT: s_addc_u32 s9, s9, 0 -; G_GFX7-NEXT: s_mov_b32 s4, 0 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: s_add_i32 s0, s2, 4 +; G_GFX7-NEXT: s_mov_b32 s4, 0 ; G_GFX7-NEXT: s_mov_b32 s5, 0x40450000 ; G_GFX7-NEXT: s_lshl_b32 s1, s0, 3 -; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v2, s1 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] @@ -840,13 +840,13 @@ ; G_VI-NEXT: s_mov_b32 s91, 0xe80000 ; G_VI-NEXT: s_add_u32 s88, s88, s3 ; G_VI-NEXT: s_addc_u32 s89, s89, 0 -; G_VI-NEXT: s_mov_b32 s4, 0 ; G_VI-NEXT: s_waitcnt lgkmcnt(0) ; G_VI-NEXT: s_add_i32 s0, s2, 4 +; G_VI-NEXT: s_mov_b32 s4, 0 ; G_VI-NEXT: s_mov_b32 s5, 0x40450000 ; G_VI-NEXT: s_lshl_b32 s1, s0, 3 -; G_VI-NEXT: v_mov_b32_e32 v0, s4 ; G_VI-NEXT: v_mov_b32_e32 v2, s1 +; G_VI-NEXT: v_mov_b32_e32 v0, s4 ; G_VI-NEXT: v_mov_b32_e32 v1, s5 ; G_VI-NEXT: s_mov_b32 m0, -1 ; G_VI-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] @@ -873,13 +873,13 @@ ; G_GFX9-NEXT: s_mov_b32 s10, -1 ; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000 ; G_GFX9-NEXT: s_add_u32 s8, s8, s3 -; G_GFX9-NEXT: s_mov_b32 s0, 0 ; G_GFX9-NEXT: s_addc_u32 s9, s9, 0 +; G_GFX9-NEXT: s_mov_b32 s0, 0 ; G_GFX9-NEXT: s_mov_b32 s1, 0x40450000 ; G_GFX9-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX9-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX9-NEXT: s_add_i32 s0, s2, 4 +; G_GFX9-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX9-NEXT: s_lshl_b32 s1, s0, 3 ; G_GFX9-NEXT: v_mov_b32_e32 v2, s1 ; G_GFX9-NEXT: ds_min_rtn_f64 v[2:3], v2, v[0:1] @@ -958,8 +958,8 @@ ; SI-NEXT: s_add_i32 s5, s1, 64 ; SI-NEXT: s_mov_b32 s0, 0 ; SI-NEXT: s_mov_b32 s1, 0x40450000 -; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v2, s4 +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] ; SI-NEXT: v_mov_b32_e32 v4, s5 @@ -991,8 +991,8 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_lshl_b32 s1, s0, 4 ; GFX7-NEXT: s_lshl_b32 s0, s0, 3 -; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s3 ; GFX7-NEXT: s_mov_b32 m0, -1 ; GFX7-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32 @@ -1024,8 +1024,8 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: s_lshl_b32 s1, s0, 4 ; VI-NEXT: s_lshl_b32 s0, s0, 3 -; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v2, s0 +; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: s_mov_b32 m0, -1 ; VI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] offset:32 @@ -1119,8 +1119,8 @@ ; G_SI-NEXT: s_add_i32 s2, s2, 4 ; G_SI-NEXT: s_lshl_b32 s5, s2, 3 ; G_SI-NEXT: s_mov_b32 s1, 0x40450000 -; G_SI-NEXT: v_mov_b32_e32 v0, s0 ; G_SI-NEXT: v_mov_b32_e32 v2, s5 +; G_SI-NEXT: v_mov_b32_e32 v0, s0 ; G_SI-NEXT: v_mov_b32_e32 v1, s1 ; G_SI-NEXT: s_mov_b32 m0, -1 ; G_SI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] @@ -1149,13 +1149,13 @@ ; G_GFX7-NEXT: s_mov_b32 s11, 0xe8f000 ; G_GFX7-NEXT: s_add_u32 s8, s8, s3 ; G_GFX7-NEXT: s_addc_u32 s9, s9, 0 -; G_GFX7-NEXT: s_mov_b32 s4, 0 ; G_GFX7-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX7-NEXT: s_add_i32 s0, s2, 4 +; G_GFX7-NEXT: s_mov_b32 s4, 0 ; G_GFX7-NEXT: s_mov_b32 s5, 0x40450000 ; G_GFX7-NEXT: s_lshl_b32 s1, s0, 3 -; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v2, s1 +; G_GFX7-NEXT: v_mov_b32_e32 v0, s4 ; G_GFX7-NEXT: v_mov_b32_e32 v1, s5 ; G_GFX7-NEXT: s_mov_b32 m0, -1 ; G_GFX7-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] @@ -1183,13 +1183,13 @@ ; G_VI-NEXT: s_mov_b32 s91, 0xe80000 ; G_VI-NEXT: s_add_u32 s88, s88, s3 ; G_VI-NEXT: s_addc_u32 s89, s89, 0 -; G_VI-NEXT: s_mov_b32 s4, 0 ; G_VI-NEXT: s_waitcnt lgkmcnt(0) ; G_VI-NEXT: s_add_i32 s0, s2, 4 +; G_VI-NEXT: s_mov_b32 s4, 0 ; G_VI-NEXT: s_mov_b32 s5, 0x40450000 ; G_VI-NEXT: s_lshl_b32 s1, s0, 3 -; G_VI-NEXT: v_mov_b32_e32 v0, s4 ; G_VI-NEXT: v_mov_b32_e32 v2, s1 +; G_VI-NEXT: v_mov_b32_e32 v0, s4 ; G_VI-NEXT: v_mov_b32_e32 v1, s5 ; G_VI-NEXT: s_mov_b32 m0, -1 ; G_VI-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] @@ -1216,13 +1216,13 @@ ; G_GFX9-NEXT: s_mov_b32 s10, -1 ; G_GFX9-NEXT: s_mov_b32 s11, 0xe00000 ; G_GFX9-NEXT: s_add_u32 s8, s8, s3 -; G_GFX9-NEXT: s_mov_b32 s0, 0 ; G_GFX9-NEXT: s_addc_u32 s9, s9, 0 +; G_GFX9-NEXT: s_mov_b32 s0, 0 ; G_GFX9-NEXT: s_mov_b32 s1, 0x40450000 ; G_GFX9-NEXT: v_mov_b32_e32 v0, s0 -; G_GFX9-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX9-NEXT: s_waitcnt lgkmcnt(0) ; G_GFX9-NEXT: s_add_i32 s0, s2, 4 +; G_GFX9-NEXT: v_mov_b32_e32 v1, s1 ; G_GFX9-NEXT: s_lshl_b32 s1, s0, 3 ; G_GFX9-NEXT: v_mov_b32_e32 v2, s1 ; G_GFX9-NEXT: ds_max_rtn_f64 v[2:3], v2, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.dec.ll @@ -389,8 +389,8 @@ } ; GCN-LABEL: {{^}}global_atomic_dec_ret_i64_offset_addr64: -; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}} ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} @@ -405,8 +405,8 @@ } ; GCN-LABEL: {{^}}global_atomic_dec_noret_i64_offset_addr64: -; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_dec_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}} ; VI: flat_atomic_dec_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.atomic.inc.ll @@ -240,8 +240,8 @@ } ; GCN-LABEL: {{^}}global_atomic_inc_ret_i64_offset_addr64: -; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40 glc{{$}} ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}} glc{{$}} @@ -256,8 +256,8 @@ } ; GCN-LABEL: {{^}}global_atomic_inc_noret_i64_offset_addr64: -; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; CI: v_mov_b32_e32 v{{[0-9]+}}, 0{{$}} +; GCN: v_mov_b32_e32 v[[KLO:[0-9]+]], 42 ; GCN: v_mov_b32_e32 v[[KHI:[0-9]+]], 0{{$}} ; CI: buffer_atomic_inc_x2 v{{\[}}[[KLO]]:[[KHI]]{{\]}}, v{{\[[0-9]+:[0-9]+\]}}, s{{\[[0-9]+:[0-9]+\]}}, 0 addr64 offset:40{{$}} ; VI: flat_atomic_inc_x2 v{{\[[0-9]+:[0-9]+\]}}, v{{\[}}[[KLO]]:[[KHI]]{{\]}}{{$}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.cvt.pkrtz.ll @@ -22,10 +22,10 @@ ; VI-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; VI-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c ; VI-NEXT: s_waitcnt lgkmcnt(0) +; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: v_mov_b32_e32 v0, s1 ; VI-NEXT: v_cvt_pkrtz_f16_f32 v2, s0, v0 ; VI-NEXT: v_mov_b32_e32 v0, s2 -; VI-NEXT: v_mov_b32_e32 v1, s3 ; VI-NEXT: flat_store_dword v[0:1], v2 ; VI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -76,11 +76,11 @@ ; GFX6789-LABEL: load_1d_tfe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v6, 0 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v7, v6 ; GFX6789-NEXT: v_mov_b32_e32 v8, v6 ; GFX6789-NEXT: v_mov_b32_e32 v9, v6 ; GFX6789-NEXT: v_mov_b32_e32 v10, v6 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v6 ; GFX6789-NEXT: v_mov_b32_e32 v1, v7 ; GFX6789-NEXT: v_mov_b32_e32 v2, v8 @@ -164,11 +164,11 @@ ; GFX6789-LABEL: load_1d_lwe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v6, 0 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v7, v6 ; GFX6789-NEXT: v_mov_b32_e32 v8, v6 ; GFX6789-NEXT: v_mov_b32_e32 v9, v6 ; GFX6789-NEXT: v_mov_b32_e32 v10, v6 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v6 ; GFX6789-NEXT: v_mov_b32_e32 v1, v7 ; GFX6789-NEXT: v_mov_b32_e32 v2, v8 @@ -289,12 +289,12 @@ ; GFX6789-LABEL: load_2d_tfe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v7, 0 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v8, v7 ; GFX6789-NEXT: v_mov_b32_e32 v9, v7 ; GFX6789-NEXT: v_mov_b32_e32 v10, v7 ; GFX6789-NEXT: v_mov_b32_e32 v11, v7 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v7 ; GFX6789-NEXT: v_mov_b32_e32 v1, v8 ; GFX6789-NEXT: v_mov_b32_e32 v2, v9 @@ -418,13 +418,13 @@ ; GFX6789-LABEL: load_3d_tfe_lwe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v8, 0 -; GFX6789-NEXT: v_mov_b32_e32 v7, v2 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v9, v8 ; GFX6789-NEXT: v_mov_b32_e32 v10, v8 ; GFX6789-NEXT: v_mov_b32_e32 v11, v8 ; GFX6789-NEXT: v_mov_b32_e32 v12, v8 +; GFX6789-NEXT: v_mov_b32_e32 v7, v2 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v8 ; GFX6789-NEXT: v_mov_b32_e32 v1, v9 ; GFX6789-NEXT: v_mov_b32_e32 v2, v10 @@ -549,13 +549,13 @@ ; GFX6789-LABEL: load_cube_lwe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v8, 0 -; GFX6789-NEXT: v_mov_b32_e32 v7, v2 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v9, v8 ; GFX6789-NEXT: v_mov_b32_e32 v10, v8 ; GFX6789-NEXT: v_mov_b32_e32 v11, v8 ; GFX6789-NEXT: v_mov_b32_e32 v12, v8 +; GFX6789-NEXT: v_mov_b32_e32 v7, v2 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v8 ; GFX6789-NEXT: v_mov_b32_e32 v1, v9 ; GFX6789-NEXT: v_mov_b32_e32 v2, v10 @@ -678,12 +678,12 @@ ; GFX6789-LABEL: load_1darray_tfe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v7, 0 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v8, v7 ; GFX6789-NEXT: v_mov_b32_e32 v9, v7 ; GFX6789-NEXT: v_mov_b32_e32 v10, v7 ; GFX6789-NEXT: v_mov_b32_e32 v11, v7 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v7 ; GFX6789-NEXT: v_mov_b32_e32 v1, v8 ; GFX6789-NEXT: v_mov_b32_e32 v2, v9 @@ -807,13 +807,13 @@ ; GFX6789-LABEL: load_2darray_lwe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v8, 0 -; GFX6789-NEXT: v_mov_b32_e32 v7, v2 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v9, v8 ; GFX6789-NEXT: v_mov_b32_e32 v10, v8 ; GFX6789-NEXT: v_mov_b32_e32 v11, v8 ; GFX6789-NEXT: v_mov_b32_e32 v12, v8 +; GFX6789-NEXT: v_mov_b32_e32 v7, v2 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v8 ; GFX6789-NEXT: v_mov_b32_e32 v1, v9 ; GFX6789-NEXT: v_mov_b32_e32 v2, v10 @@ -938,13 +938,13 @@ ; GFX6789-LABEL: load_2dmsaa_both: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v8, 0 -; GFX6789-NEXT: v_mov_b32_e32 v7, v2 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v9, v8 ; GFX6789-NEXT: v_mov_b32_e32 v10, v8 ; GFX6789-NEXT: v_mov_b32_e32 v11, v8 ; GFX6789-NEXT: v_mov_b32_e32 v12, v8 +; GFX6789-NEXT: v_mov_b32_e32 v7, v2 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v8 ; GFX6789-NEXT: v_mov_b32_e32 v1, v9 ; GFX6789-NEXT: v_mov_b32_e32 v2, v10 @@ -1071,14 +1071,14 @@ ; GFX6789-LABEL: load_2darraymsaa_tfe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v9, 0 -; GFX6789-NEXT: v_mov_b32_e32 v8, v3 -; GFX6789-NEXT: v_mov_b32_e32 v7, v2 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v10, v9 ; GFX6789-NEXT: v_mov_b32_e32 v11, v9 ; GFX6789-NEXT: v_mov_b32_e32 v12, v9 ; GFX6789-NEXT: v_mov_b32_e32 v13, v9 +; GFX6789-NEXT: v_mov_b32_e32 v8, v3 +; GFX6789-NEXT: v_mov_b32_e32 v7, v2 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v9 ; GFX6789-NEXT: v_mov_b32_e32 v1, v10 ; GFX6789-NEXT: v_mov_b32_e32 v2, v11 @@ -1202,12 +1202,12 @@ ; GFX6789-LABEL: load_mip_1d_lwe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v7, 0 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v8, v7 ; GFX6789-NEXT: v_mov_b32_e32 v9, v7 ; GFX6789-NEXT: v_mov_b32_e32 v10, v7 ; GFX6789-NEXT: v_mov_b32_e32 v11, v7 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v7 ; GFX6789-NEXT: v_mov_b32_e32 v1, v8 ; GFX6789-NEXT: v_mov_b32_e32 v2, v9 @@ -1331,13 +1331,13 @@ ; GFX6789-LABEL: load_mip_2d_tfe: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v8, 0 -; GFX6789-NEXT: v_mov_b32_e32 v7, v2 -; GFX6789-NEXT: v_mov_b32_e32 v6, v1 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v9, v8 ; GFX6789-NEXT: v_mov_b32_e32 v10, v8 ; GFX6789-NEXT: v_mov_b32_e32 v11, v8 ; GFX6789-NEXT: v_mov_b32_e32 v12, v8 +; GFX6789-NEXT: v_mov_b32_e32 v7, v2 +; GFX6789-NEXT: v_mov_b32_e32 v6, v1 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v8 ; GFX6789-NEXT: v_mov_b32_e32 v1, v9 ; GFX6789-NEXT: v_mov_b32_e32 v2, v10 @@ -1727,10 +1727,10 @@ ; GFX6789-LABEL: load_1d_tfe_V4_dmask3: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v5, 0 -; GFX6789-NEXT: v_mov_b32_e32 v4, v0 ; GFX6789-NEXT: v_mov_b32_e32 v6, v5 ; GFX6789-NEXT: v_mov_b32_e32 v7, v5 ; GFX6789-NEXT: v_mov_b32_e32 v8, v5 +; GFX6789-NEXT: v_mov_b32_e32 v4, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v5 ; GFX6789-NEXT: v_mov_b32_e32 v1, v6 ; GFX6789-NEXT: v_mov_b32_e32 v2, v7 @@ -1807,9 +1807,9 @@ ; GFX6789-LABEL: load_1d_tfe_V4_dmask2: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v4, 0 -; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v5, v4 ; GFX6789-NEXT: v_mov_b32_e32 v6, v4 +; GFX6789-NEXT: v_mov_b32_e32 v3, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v4 ; GFX6789-NEXT: v_mov_b32_e32 v1, v5 ; GFX6789-NEXT: v_mov_b32_e32 v2, v6 @@ -1881,8 +1881,8 @@ ; GFX6789-LABEL: load_1d_tfe_V4_dmask1: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v3, 0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v3 +; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v3 ; GFX6789-NEXT: v_mov_b32_e32 v1, v4 ; GFX6789-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 unorm tfe @@ -1951,8 +1951,8 @@ ; GFX6789-LABEL: load_1d_tfe_V2_dmask1: ; GFX6789: ; %bb.0: ; %main_body ; GFX6789-NEXT: v_mov_b32_e32 v3, 0 -; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v4, v3 +; GFX6789-NEXT: v_mov_b32_e32 v2, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v3 ; GFX6789-NEXT: v_mov_b32_e32 v1, v4 ; GFX6789-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 unorm tfe diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -60,11 +60,11 @@ ; GFX6789-NEXT: s_mov_b64 s[14:15], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: v_mov_b32_e32 v6, 0 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v7, v6 ; GFX6789-NEXT: v_mov_b32_e32 v8, v6 ; GFX6789-NEXT: v_mov_b32_e32 v9, v6 ; GFX6789-NEXT: v_mov_b32_e32 v10, v6 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v6 ; GFX6789-NEXT: v_mov_b32_e32 v1, v7 ; GFX6789-NEXT: v_mov_b32_e32 v2, v8 @@ -480,11 +480,11 @@ ; GFX6789-NEXT: s_mov_b64 s[14:15], exec ; GFX6789-NEXT: s_wqm_b64 exec, exec ; GFX6789-NEXT: v_mov_b32_e32 v6, 0 -; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v7, v6 ; GFX6789-NEXT: v_mov_b32_e32 v8, v6 ; GFX6789-NEXT: v_mov_b32_e32 v9, v6 ; GFX6789-NEXT: v_mov_b32_e32 v10, v6 +; GFX6789-NEXT: v_mov_b32_e32 v5, v0 ; GFX6789-NEXT: v_mov_b32_e32 v0, v6 ; GFX6789-NEXT: v_mov_b32_e32 v1, v7 ; GFX6789-NEXT: v_mov_b32_e32 v2, v8 diff --git a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-constant-i16.ll @@ -167,10 +167,10 @@ ; GCN-HSA-NEXT: s_add_u32 s4, s0, 4 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s2 ; GCN-HSA-NEXT: flat_store_short v[2:3], v4 @@ -402,12 +402,12 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 -; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 +; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] ; GCN-HSA-NEXT: s_endpgm @@ -514,10 +514,10 @@ ; GCN-HSA-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -1448,9 +1448,9 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -1576,16 +1576,16 @@ ; GCN-HSA-NEXT: s_ashr_i32 s3, s6, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 ; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 +; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 -; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 @@ -1744,20 +1744,20 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -1953,27 +1953,29 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 ; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 ; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 +; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 -; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 +; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 +; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 @@ -1981,8 +1983,6 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 -; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 @@ -2239,58 +2239,58 @@ ; GCN-HSA-NEXT: s_lshr_b32 s18, s18, 16 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s35 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s16 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s17 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s33 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -2612,73 +2612,75 @@ ; GCN-HSA-NEXT: s_ashr_i32 s36, s18, 16 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s16, s16 ; GCN-HSA-NEXT: s_sext_i32_i16 s19, s19 ; GCN-HSA-NEXT: s_sext_i32_i16 s18, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: s_sext_i32_i16 s17, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s34 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 +; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s33 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] -; GCN-HSA-NEXT: s_sext_i32_i16 s15, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s14, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_sext_i32_i16 s13, s13 ; GCN-HSA-NEXT: s_sext_i32_i16 s12, s12 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_sext_i32_i16 s11, s11 ; GCN-HSA-NEXT: s_sext_i32_i16 s10, s10 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s26 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_sext_i32_i16 s9, s9 ; GCN-HSA-NEXT: s_sext_i32_i16 s8, s8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 +; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_sext_i32_i16 s7, s7 -; GCN-HSA-NEXT: s_sext_i32_i16 s6, s6 +; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 +; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s6 @@ -2686,8 +2688,6 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 -; GCN-HSA-NEXT: s_sext_i32_i16 s5, s5 -; GCN-HSA-NEXT: s_sext_i32_i16 s4, s4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 @@ -3137,57 +3137,58 @@ ; GCN-HSA-NEXT: s_lshr_b32 s50, s50, 16 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v30, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x90 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s67 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s48 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s66 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s49 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v34, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x70 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s61 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s60 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s53 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s50 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s68 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s65 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s46 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s64 @@ -3206,9 +3207,9 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s56 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s7 +; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s36 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s14 @@ -3226,25 +3227,24 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GCN-HSA-NEXT: s_add_u32 s6, s16, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s16, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s28 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 -; GCN-HSA-NEXT: s_add_u32 s6, s16, 48 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s55 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 @@ -3257,9 +3257,9 @@ ; GCN-HSA-NEXT: s_add_u32 s4, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -3267,9 +3267,9 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s22 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 @@ -3860,27 +3860,26 @@ ; GCN-HSA-NEXT: s_ashr_i32 s68, s50, 16 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v31, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v30, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: v_mov_b32_e32 v31, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s49, s49 ; GCN-HSA-NEXT: s_sext_i32_i16 s48, s48 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 @@ -3889,13 +3888,13 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s66 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s49 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s65 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v35, s3 ; GCN-HSA-NEXT: s_sext_i32_i16 s43, s43 ; GCN-HSA-NEXT: s_sext_i32_i16 s42, s42 ; GCN-HSA-NEXT: v_mov_b32_e32 v34, s2 @@ -3904,15 +3903,15 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s60 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s43 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s59 +; GCN-HSA-NEXT: v_mov_b32_e32 v35, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GCN-HSA-NEXT: s_sext_i32_i16 s51, s51 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_sext_i32_i16 s50, s50 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_sext_i32_i16 s36, s36 ; GCN-HSA-NEXT: s_sext_i32_i16 s39, s39 ; GCN-HSA-NEXT: s_sext_i32_i16 s38, s38 @@ -3951,8 +3950,11 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s55 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s54 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_sext_i32_i16 s27, s27 +; GCN-HSA-NEXT: s_sext_i32_i16 s26, s26 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s54 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s37 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s30 @@ -3970,42 +3972,40 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: s_sext_i32_i16 s27, s27 -; GCN-HSA-NEXT: s_sext_i32_i16 s26, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s24 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s20 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s19 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s17 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -4904,8 +4904,8 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -5023,9 +5023,9 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -5167,16 +5167,16 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -5347,20 +5347,20 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -5574,44 +5574,44 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -5887,15 +5887,15 @@ ; GCN-HSA-NEXT: s_bfe_i64 s[16:17], s[16:17], 0x100000 ; GCN-HSA-NEXT: s_add_u32 s26, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s27, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s12 ; GCN-HSA-NEXT: s_add_u32 s12, s0, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s13 ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s12 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s13 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] @@ -5903,9 +5903,9 @@ ; GCN-HSA-NEXT: s_add_u32 s10, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s18 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s19 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -5914,21 +5914,21 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 0x60 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 +; GCN-HSA-NEXT: s_add_u32 s8, s0, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 -; GCN-HSA-NEXT: s_add_u32 s8, s0, 64 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_addc_u32 s9, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s31 @@ -5941,9 +5941,9 @@ ; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s15 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -6283,29 +6283,28 @@ ; GCN-HSA-NEXT: s_lshr_b32 s4, s4, 16 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s19 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s17 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s15 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[0:3] @@ -6314,71 +6313,72 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s13 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[0:3] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s33 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s11 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s31 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s30 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s29 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s18 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s27 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s14 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s25 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s12 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s23 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 @@ -6867,38 +6867,38 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s45 ; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s45 ; GCN-HSA-NEXT: s_add_u32 s44, s16, 0xb0 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s45 ; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v26, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s45 ; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s45 ; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s44 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s56 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s45 ; GCN-HSA-NEXT: s_add_u32 s44, s16, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s56 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s57 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s45 ; GCN-HSA-NEXT: s_addc_u32 s45, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s40 ; GCN-HSA-NEXT: s_add_u32 s40, s16, 0x50 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s41 ; GCN-HSA-NEXT: s_addc_u32 s41, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v6, s80 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s38 ; GCN-HSA-NEXT: s_add_u32 s38, s16, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v6, s80 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s81 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s39 -; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v24, s38 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s39 ; GCN-HSA-NEXT: s_add_u32 s38, s16, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s46 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s47 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s78 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s79 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s39 ; GCN-HSA-NEXT: s_addc_u32 s39, s17, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s48 @@ -6909,7 +6909,6 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s77 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s15 ; GCN-HSA-NEXT: s_addc_u32 s15, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v30, s44 ; GCN-HSA-NEXT: flat_store_dwordx4 v[28:29], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s50 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s12 @@ -6917,30 +6916,31 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s51 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s74 ; GCN-HSA-NEXT: v_mov_b32_e32 v19, s75 +; GCN-HSA-NEXT: v_mov_b32_e32 v30, s44 ; GCN-HSA-NEXT: v_mov_b32_e32 v31, s45 -; GCN-HSA-NEXT: v_mov_b32_e32 v32, s40 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s13 -; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s52 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s53 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s54 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s55 +; GCN-HSA-NEXT: v_mov_b32_e32 v32, s40 ; GCN-HSA-NEXT: v_mov_b32_e32 v33, s41 -; GCN-HSA-NEXT: v_mov_b32_e32 v34, s38 -; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s13 +; GCN-HSA-NEXT: s_addc_u32 s13, s17, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s42 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s15 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s43 +; GCN-HSA-NEXT: v_mov_b32_e32 v34, s38 ; GCN-HSA-NEXT: v_mov_b32_e32 v35, s39 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s36 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s37 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s34 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s35 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s30 +; GCN-HSA-NEXT: flat_store_dwordx4 v[30:31], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s31 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s14 ; GCN-HSA-NEXT: flat_store_dwordx4 v[32:33], v[20:23] +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s13 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s12 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[34:35], v[4:7] @@ -6950,9 +6950,9 @@ ; GCN-HSA-NEXT: s_add_u32 s10, s16, 0xa0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s11 ; GCN-HSA-NEXT: s_addc_u32 s11, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s28 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s29 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -6960,9 +6960,9 @@ ; GCN-HSA-NEXT: s_add_u32 s8, s16, 0x80 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s9 ; GCN-HSA-NEXT: s_addc_u32 s9, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s26 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s27 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s8 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s9 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -6970,9 +6970,9 @@ ; GCN-HSA-NEXT: s_add_u32 s6, s16, 0x60 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s7 ; GCN-HSA-NEXT: s_addc_u32 s7, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s24 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s25 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s6 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -6980,9 +6980,9 @@ ; GCN-HSA-NEXT: s_add_u32 s4, s16, 64 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s22 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s23 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_nop 0 @@ -6990,9 +6990,9 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s16, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s17, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s20 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s21 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 diff --git a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll --- a/llvm/test/CodeGen/AMDGPU/load-global-i16.ll +++ b/llvm/test/CodeGen/AMDGPU/load-global-i16.ll @@ -226,8 +226,8 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 4 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: flat_store_short v[4:5], v1 @@ -525,18 +525,18 @@ ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] @@ -673,8 +673,8 @@ ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 @@ -683,8 +683,8 @@ ; GCN-HSA-NEXT: s_add_u32 s0, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s1, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: flat_store_dwordx4 v[10:11], v[4:7] @@ -1855,8 +1855,8 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3 @@ -2005,8 +2005,8 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v3 @@ -2165,23 +2165,23 @@ ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_mov_b32 s4, 0xffff +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 @@ -2403,8 +2403,8 @@ ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 @@ -2412,16 +2412,16 @@ ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(1) @@ -2678,14 +2678,14 @@ ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 @@ -2705,6 +2705,7 @@ ; GCN-HSA-NEXT: s_addc_u32 s11, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s12, s0, 32 ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v21, s6 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) @@ -2722,39 +2723,37 @@ ; GCN-HSA-NEXT: v_and_b32_e32 v18, s14, v5 ; GCN-HSA-NEXT: v_and_b32_e32 v16, s14, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s9 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v19, 16, v7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v17, 16, v6 ; GCN-HSA-NEXT: v_and_b32_e32 v18, s14, v7 ; GCN-HSA-NEXT: v_and_b32_e32 v16, s14, v6 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v20, s3 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v12 ; GCN-HSA-NEXT: v_and_b32_e32 v6, s14, v13 ; GCN-HSA-NEXT: v_and_b32_e32 v4, s14, v12 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s2 -; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v15 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v14 ; GCN-HSA-NEXT: v_and_b32_e32 v17, s14, v15 ; GCN-HSA-NEXT: v_and_b32_e32 v15, s14, v14 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 +; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[19:20], v[15:18] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v18, 16, v9 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v16, 16, v8 ; GCN-HSA-NEXT: v_and_b32_e32 v17, s14, v9 ; GCN-HSA-NEXT: v_and_b32_e32 v15, s14, v8 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v2 ; GCN-HSA-NEXT: v_and_b32_e32 v5, s14, v3 ; GCN-HSA-NEXT: v_and_b32_e32 v3, s14, v2 @@ -2762,6 +2761,7 @@ ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v12, 16, v10 ; GCN-HSA-NEXT: v_and_b32_e32 v13, s14, v11 ; GCN-HSA-NEXT: v_and_b32_e32 v11, s14, v10 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[15:18] ; GCN-HSA-NEXT: flat_store_dwordx4 v[21:22], v[11:14] @@ -3089,10 +3089,10 @@ ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v4, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -3101,8 +3101,8 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] @@ -3117,37 +3117,36 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 16, v3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 16, v2 ; GCN-HSA-NEXT: v_bfe_i32 v18, v3, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v16, v2, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[16:19] ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v5 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v4 ; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v4, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v7 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v6 ; GCN-HSA-NEXT: v_bfe_i32 v2, v7, 0, 16 @@ -3157,15 +3156,15 @@ ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v8 ; GCN-HSA-NEXT: v_bfe_i32 v6, v9, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v4, v8, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v11 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v10 ; GCN-HSA-NEXT: v_bfe_i32 v2, v11, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v10, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7] ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[0:3] ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) @@ -3173,6 +3172,7 @@ ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v12 ; GCN-HSA-NEXT: v_bfe_i32 v6, v13, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v4, v12, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v15 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v14 @@ -3629,36 +3629,36 @@ ; GCN-HSA-NEXT: s_add_u32 s4, s2, s6 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, s7 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] -; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[2:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 +; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] +; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[2:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: s_add_u32 s10, s2, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCN-HSA-NEXT: s_addc_u32 s11, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v29, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] @@ -3677,15 +3677,15 @@ ; GCN-HSA-NEXT: s_add_u32 s16, s0, 0xa0 ; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v5 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v4 -; GCN-HSA-NEXT: v_and_b32_e32 v26, s9, v5 ; GCN-HSA-NEXT: v_and_b32_e32 v24, s9, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: s_add_u32 s16, s0, 0xb0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[24:27] +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v5 +; GCN-HSA-NEXT: v_and_b32_e32 v26, s9, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s17 ; GCN-HSA-NEXT: s_addc_u32 s17, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[24:27] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s16 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v27, 16, v7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v25, 16, v6 @@ -3717,34 +3717,33 @@ ; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s10 -; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v11 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v10 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v11 ; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v10 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s11 -; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v27, s5 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v25, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v24, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v26, s4 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(6) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v33 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v32 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v33 ; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v32 +; GCN-HSA-NEXT: v_mov_b32_e32 v27, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v26, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v35 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v34 ; GCN-HSA-NEXT: v_and_b32_e32 v6, s9, v35 ; GCN-HSA-NEXT: v_and_b32_e32 v4, s9, v34 ; GCN-HSA-NEXT: flat_store_dwordx4 v[24:25], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[26:27], v[4:7] -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v29 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v28 ; GCN-HSA-NEXT: v_and_b32_e32 v10, s9, v29 @@ -3756,13 +3755,13 @@ ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v30 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v31 ; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v30 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, s8 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v21 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v20 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v21 @@ -3774,38 +3773,39 @@ ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v22 ; GCN-HSA-NEXT: v_and_b32_e32 v6, s9, v23 ; GCN-HSA-NEXT: v_and_b32_e32 v4, s9, v22 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v15 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v14 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] -; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v15 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v3, 16, v15 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v13 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v12 -; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v14 ; GCN-HSA-NEXT: v_and_b32_e32 v6, s9, v13 ; GCN-HSA-NEXT: v_and_b32_e32 v4, s9, v12 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v17 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v16 -; GCN-HSA-NEXT: v_and_b32_e32 v14, s9, v17 ; GCN-HSA-NEXT: v_and_b32_e32 v12, s9, v16 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, s6 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v14 +; GCN-HSA-NEXT: v_and_b32_e32 v2, s9, v15 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s9, v14 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v17 +; GCN-HSA-NEXT: v_and_b32_e32 v14, s9, v17 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v19 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v18 ; GCN-HSA-NEXT: v_and_b32_e32 v10, s9, v19 ; GCN-HSA-NEXT: v_and_b32_e32 v8, s9, v18 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] ; GCN-HSA-NEXT: s_nop 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 @@ -4410,18 +4410,18 @@ ; GCN-HSA-NEXT: s_add_u32 s4, s2, s9 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_add_u32 s4, s2, s10 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[2:3] ; GCN-HSA-NEXT: s_add_u32 s4, s2, s8 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -4430,8 +4430,8 @@ ; GCN-HSA-NEXT: s_add_u32 s6, s2, 48 ; GCN-HSA-NEXT: s_addc_u32 s7, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s7 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s6 ; GCN-HSA-NEXT: flat_load_dwordx4 v[16:19], v[16:17] ; GCN-HSA-NEXT: flat_load_dwordx4 v[20:23], v[20:21] @@ -4440,8 +4440,8 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v29, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v28, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v33, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v32, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[28:31], v[28:29] ; GCN-HSA-NEXT: flat_load_dwordx4 v[32:35], v[32:33] @@ -4450,32 +4450,32 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v13 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v12 -; GCN-HSA-NEXT: v_bfe_i32 v26, v13, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v24, v12, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v13 +; GCN-HSA-NEXT: v_bfe_i32 v26, v13, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[24:27] -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v27, 16, v15 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v25, 16, v14 ; GCN-HSA-NEXT: v_bfe_i32 v26, v15, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v24, v14, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[24:27] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v9 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v8 -; GCN-HSA-NEXT: v_bfe_i32 v14, v9, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v12, v8, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xd0 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v9 +; GCN-HSA-NEXT: v_bfe_i32 v14, v9, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[12:15] ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 @@ -4484,6 +4484,7 @@ ; GCN-HSA-NEXT: v_bfe_i32 v13, v11, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v11, v10, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[11:14] ; GCN-HSA-NEXT: s_waitcnt vmcnt(8) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v4 @@ -4492,34 +4493,32 @@ ; GCN-HSA-NEXT: v_bfe_i32 v8, v4, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11] ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 16, v7 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v2 +; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v6 ; GCN-HSA-NEXT: v_bfe_i32 v14, v7, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v12, v6, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[8:11] ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v0 ; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v3 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v2 ; GCN-HSA-NEXT: v_bfe_i32 v10, v3, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xb0 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x90 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[8:11] @@ -4528,26 +4527,26 @@ ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v16 ; GCN-HSA-NEXT: v_bfe_i32 v2, v17, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v16, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, s10 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v19 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v18 ; GCN-HSA-NEXT: v_bfe_i32 v2, v19, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v18, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, s9 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: s_waitcnt vmcnt(12) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v23 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v22 @@ -4561,37 +4560,39 @@ ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 16, v28 ; GCN-HSA-NEXT: v_bfe_i32 v14, v29, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v12, v28, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v31 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v21 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v30 -; GCN-HSA-NEXT: v_bfe_i32 v10, v31, 0, 16 -; GCN-HSA-NEXT: v_bfe_i32 v8, v30, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v21 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v20 ; GCN-HSA-NEXT: v_bfe_i32 v2, v21, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v20, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 16, v31 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 16, v30 +; GCN-HSA-NEXT: v_bfe_i32 v10, v31, 0, 16 +; GCN-HSA-NEXT: v_bfe_i32 v8, v30, 0, 16 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] -; GCN-HSA-NEXT: s_waitcnt vmcnt(14) +; GCN-HSA-NEXT: s_waitcnt vmcnt(13) ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 16, v33 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 16, v32 ; GCN-HSA-NEXT: v_bfe_i32 v6, v33, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v4, v32, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 16, v35 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 16, v34 ; GCN-HSA-NEXT: v_bfe_i32 v2, v35, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v0, v34, 0, 16 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm @@ -5779,14 +5780,14 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: flat_load_dwordx2 v[8:9], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v9 @@ -5935,8 +5936,8 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v2 @@ -6107,15 +6108,15 @@ ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, v12 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, 0 @@ -6127,12 +6128,12 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v4, v12 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, v12 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v1 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 -; GCN-HSA-NEXT: v_and_b32_e32 v3, s4, v0 ; GCN-HSA-NEXT: v_and_b32_e32 v12, s4, v1 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v2 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 +; GCN-HSA-NEXT: v_and_b32_e32 v3, s4, v0 ; GCN-HSA-NEXT: v_and_b32_e32 v7, s4, v2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[12:15] ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[7:10] @@ -6324,15 +6325,15 @@ ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 @@ -6577,10 +6578,10 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v8 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[4:5] @@ -6588,45 +6589,43 @@ ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 ; GCN-HSA-NEXT: s_waitcnt vmcnt(0) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v5 ; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v5 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] -; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v1 ; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v1 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s5 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v3 ; GCN-HSA-NEXT: v_and_b32_e32 v9, s6, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 -; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[9:12] +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v9, 16, v7 ; GCN-HSA-NEXT: v_and_b32_e32 v7, s6, v7 -; GCN-HSA-NEXT: s_add_u32 s2, s0, 64 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v6 ; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v6 ; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[7:10] ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[11:14] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s0 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 0x60 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, v8 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, v14 ; GCN-HSA-NEXT: v_mov_b32_e32 v7, v8 @@ -6634,12 +6633,14 @@ ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v13, 16, v2 ; GCN-HSA-NEXT: v_and_b32_e32 v11, s6, v2 ; GCN-HSA-NEXT: v_and_b32_e32 v6, s6, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v4 ; GCN-HSA-NEXT: v_and_b32_e32 v0, s6, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[0:3] ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[6:9] @@ -6949,8 +6950,8 @@ ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 @@ -6958,17 +6959,17 @@ ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[2:3] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 48 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x70 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v13, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s0 @@ -6976,22 +6977,22 @@ ; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[4:5], 48 ; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 +; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[8:11] ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 32 ; GCN-HSA-NEXT: v_bfe_i32 v8, v5, 0, 16 ; GCN-HSA-NEXT: v_ashr_i64 v[10:11], v[6:7], 48 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v6 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v6 -; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] ; GCN-HSA-NEXT: v_bfe_i32 v7, v5, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 ; GCN-HSA-NEXT: v_bfe_i32 v5, v6, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[14:15], v[8:11] ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v6, 31, v5 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v8, 31, v7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[5:8] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 @@ -7000,16 +7001,15 @@ ; GCN-HSA-NEXT: v_bfe_i32 v6, v6, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[4:7] -; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) ; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[0:1], 48 ; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, v3 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v8, 16, v2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 -; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v1, 16, v0 ; GCN-HSA-NEXT: v_bfe_i32 v4, v0, 0, 16 @@ -7017,11 +7017,12 @@ ; GCN-HSA-NEXT: v_bfe_i32 v8, v2, 0, 16 ; GCN-HSA-NEXT: v_ashr_i64 v[2:3], v[2:3], 48 ; GCN-HSA-NEXT: v_bfe_i32 v0, v11, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 +; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 ; GCN-HSA-NEXT: v_bfe_i32 v6, v1, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 @@ -7437,17 +7438,17 @@ ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s4 ; GCN-HSA-NEXT: s_add_u32 s4, s2, 32 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s5 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v11, s5 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s4 ; GCN-HSA-NEXT: flat_load_dwordx4 v[6:9], v[6:7] ; GCN-HSA-NEXT: flat_load_dwordx4 v[10:13], v[10:11] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s2, 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s2 @@ -7467,9 +7468,9 @@ ; GCN-HSA-NEXT: s_addc_u32 s13, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x70 ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s14 ; GCN-HSA-NEXT: s_add_u32 s14, s0, 0x50 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s15 ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 ; GCN-HSA-NEXT: s_waitcnt vmcnt(3) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v9 @@ -7491,87 +7492,87 @@ ; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v11 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s12 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v23, s5 +; GCN-HSA-NEXT: v_mov_b32_e32 v22, s4 +; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: s_waitcnt vmcnt(4) ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v19 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v19 -; GCN-HSA-NEXT: v_mov_b32_e32 v22, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v23, s5 +; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v17 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v17 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xe0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v23, s9 -; GCN-HSA-NEXT: s_add_u32 s4, s0, 32 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v15 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v15 ; GCN-HSA-NEXT: v_mov_b32_e32 v22, s8 -; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] ; GCN-HSA-NEXT: v_mov_b32_e32 v7, 0 +; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: flat_store_dwordx4 v[22:23], v[2:5] +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v18 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v18 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s1 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xe0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, v7 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s0 -; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s6 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xc0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[17:18], v[2:5] -; GCN-HSA-NEXT: v_and_b32_e32 v0, s16, v21 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s6 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v16 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v16 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xc0 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xa0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v11, s6 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v14 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v14 ; GCN-HSA-NEXT: v_mov_b32_e32 v14, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0xa0 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 +; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x80 ; GCN-HSA-NEXT: flat_store_dwordx4 v[13:14], v[2:5] -; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s6 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v12 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v12 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v11, s6 -; GCN-HSA-NEXT: s_add_u32 s6, s0, 0x80 ; GCN-HSA-NEXT: s_addc_u32 s7, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[11:12], v[2:5] -; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 +; GCN-HSA-NEXT: v_mov_b32_e32 v15, s2 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v4, 16, v10 ; GCN-HSA-NEXT: v_and_b32_e32 v2, s16, v10 ; GCN-HSA-NEXT: v_mov_b32_e32 v10, s7 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s6 +; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 ; GCN-HSA-NEXT: flat_store_dwordx4 v[9:10], v[2:5] -; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v6 +; GCN-HSA-NEXT: v_and_b32_e32 v0, s16, v21 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v2, 16, v21 -; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[0:3] +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 +; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v6 ; GCN-HSA-NEXT: v_and_b32_e32 v9, s16, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v20 ; GCN-HSA-NEXT: v_and_b32_e32 v4, s16, v20 +; GCN-HSA-NEXT: flat_store_dwordx4 v[15:16], v[0:3] ; GCN-HSA-NEXT: v_mov_b32_e32 v5, v1 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 -; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v14, 16, v8 ; GCN-HSA-NEXT: v_and_b32_e32 v12, s16, v8 +; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[4:7] ; GCN-HSA-NEXT: v_mov_b32_e32 v13, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v15, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s3 -; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[2:3], v[12:15] -; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 +; GCN-HSA-NEXT: v_mov_b32_e32 v10, v1 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, v7 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s0 +; GCN-HSA-NEXT: v_mov_b32_e32 v1, s1 ; GCN-HSA-NEXT: flat_store_dwordx4 v[0:1], v[9:12] ; GCN-HSA-NEXT: s_endpgm ; @@ -8130,10 +8131,10 @@ ; GCN-HSA-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 ; GCN-HSA-NEXT: s_waitcnt lgkmcnt(0) ; GCN-HSA-NEXT: s_add_u32 s4, s2, 48 -; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: s_addc_u32 s5, s3, 0 -; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v0, s2 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v2, s4 ; GCN-HSA-NEXT: v_mov_b32_e32 v3, s5 ; GCN-HSA-NEXT: flat_load_dwordx4 v[4:7], v[0:1] ; GCN-HSA-NEXT: flat_load_dwordx4 v[0:3], v[2:3] @@ -8142,8 +8143,8 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s2, 16 ; GCN-HSA-NEXT: s_addc_u32 s3, s3, 0 ; GCN-HSA-NEXT: v_mov_b32_e32 v9, s5 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v8, s4 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: flat_load_dwordx4 v[8:11], v[8:9] ; GCN-HSA-NEXT: flat_load_dwordx4 v[12:15], v[12:13] @@ -8158,9 +8159,9 @@ ; GCN-HSA-NEXT: v_bfe_i32 v16, v5, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: flat_store_dwordx4 v[20:21], v[16:19] -; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v20, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xf0 +; GCN-HSA-NEXT: v_mov_b32_e32 v21, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: s_add_u32 s4, s0, 0xd0 ; GCN-HSA-NEXT: s_addc_u32 s5, s1, 0 @@ -8182,15 +8183,15 @@ ; GCN-HSA-NEXT: s_addc_u32 s15, s1, 0 ; GCN-HSA-NEXT: v_bfe_i32 v18, v5, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v16, v6, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s14 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v17, 31, v16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v19, 31, v18 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s14 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s15 ; GCN-HSA-NEXT: flat_store_dwordx4 v[5:6], v[16:19] ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v4 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: v_bfe_i32 v4, v4, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v6, v5, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 @@ -8202,25 +8203,25 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s4 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, v3 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s3 ; GCN-HSA-NEXT: v_bfe_i32 v4, v1, 0, 16 ; GCN-HSA-NEXT: v_ashr_i64 v[6:7], v[2:3], 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v18, s2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[4:7] -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s9 ; GCN-HSA-NEXT: s_waitcnt vmcnt(7) -; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[8:9], 48 ; GCN-HSA-NEXT: v_bfe_i32 v3, v9, 0, 16 +; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[8:9], 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s9 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s8 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, v11 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s7 ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[3:6] -; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s7 ; GCN-HSA-NEXT: v_bfe_i32 v3, v1, 0, 16 ; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[10:11], 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v16, s6 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[3:6] ; GCN-HSA-NEXT: v_mov_b32_e32 v17, s13 @@ -8230,11 +8231,11 @@ ; GCN-HSA-NEXT: v_mov_b32_e32 v16, s12 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 ; GCN-HSA-NEXT: v_mov_b32_e32 v1, v15 -; GCN-HSA-NEXT: v_mov_b32_e32 v19, s11 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[3:6] ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xe0 ; GCN-HSA-NEXT: v_bfe_i32 v3, v1, 0, 16 ; GCN-HSA-NEXT: v_ashr_i64 v[5:6], v[14:15], 48 +; GCN-HSA-NEXT: v_mov_b32_e32 v19, s11 ; GCN-HSA-NEXT: v_mov_b32_e32 v18, s10 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v7, 16, v2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 @@ -8242,24 +8243,23 @@ ; GCN-HSA-NEXT: flat_store_dwordx4 v[18:19], v[3:6] ; GCN-HSA-NEXT: v_bfe_i32 v1, v2, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v3, v7, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v6, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xc0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v5, 16, v0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v2, 31, v1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GCN-HSA-NEXT: v_mov_b32_e32 v7, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[6:7], v[1:4] ; GCN-HSA-NEXT: v_bfe_i32 v0, v0, 0, 16 ; GCN-HSA-NEXT: v_bfe_i32 v2, v5, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0xa0 -; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v15, 16, v10 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s3 +; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v6, 16, v14 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: v_bfe_i32 v4, v14, 0, 16 @@ -8271,30 +8271,32 @@ ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x80 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v13, 31, v12 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v15, 31, v14 +; GCN-HSA-NEXT: v_mov_b32_e32 v17, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_lshrrev_b32_e32 v11, 16, v8 ; GCN-HSA-NEXT: flat_store_dwordx4 v[16:17], v[12:15] ; GCN-HSA-NEXT: v_bfe_i32 v8, v8, 0, 16 -; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 -; GCN-HSA-NEXT: v_bfe_i32 v10, v11, 0, 16 ; GCN-HSA-NEXT: v_mov_b32_e32 v12, s2 ; GCN-HSA-NEXT: s_add_u32 s2, s0, 0x60 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 -; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 +; GCN-HSA-NEXT: v_bfe_i32 v10, v11, 0, 16 +; GCN-HSA-NEXT: v_mov_b32_e32 v13, s3 ; GCN-HSA-NEXT: s_addc_u32 s3, s1, 0 ; GCN-HSA-NEXT: v_bfe_i32 v6, v6, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v9, 31, v8 +; GCN-HSA-NEXT: v_ashrrev_i32_e32 v11, 31, v10 ; GCN-HSA-NEXT: s_add_u32 s0, s0, 64 -; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v5, 31, v4 +; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v7, 31, v6 -; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[12:13], v[8:11] ; GCN-HSA-NEXT: s_addc_u32 s1, s1, 0 -; GCN-HSA-NEXT: v_bfe_i32 v2, v2, 0, 16 -; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: v_mov_b32_e32 v9, s3 +; GCN-HSA-NEXT: v_mov_b32_e32 v8, s2 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GCN-HSA-NEXT: flat_store_dwordx4 v[8:9], v[4:7] +; GCN-HSA-NEXT: s_nop 0 +; GCN-HSA-NEXT: v_mov_b32_e32 v5, s1 ; GCN-HSA-NEXT: v_mov_b32_e32 v4, s0 ; GCN-HSA-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-HSA-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -1402,8 +1402,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1417,8 +1417,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1432,8 +1432,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1446,8 +1446,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1487,8 +1487,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1504,8 +1504,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1523,8 +1523,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1541,8 +1541,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1587,8 +1587,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1603,8 +1603,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1620,8 +1620,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1636,8 +1636,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1680,8 +1680,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1698,8 +1698,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1719,8 +1719,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1739,8 +1739,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1788,8 +1788,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1806,8 +1806,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1827,8 +1827,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1847,8 +1847,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1896,8 +1896,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1913,8 +1913,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1932,8 +1932,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1950,8 +1950,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1996,8 +1996,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2013,8 +2013,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2032,8 +2032,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2050,8 +2050,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2096,8 +2096,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2114,8 +2114,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2135,8 +2135,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2155,8 +2155,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2204,8 +2204,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2222,8 +2222,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2243,8 +2243,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2263,8 +2263,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2312,8 +2312,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2330,8 +2330,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2351,8 +2351,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2371,8 +2371,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2420,8 +2420,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2438,8 +2438,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2459,8 +2459,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2479,8 +2479,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2528,8 +2528,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2546,8 +2546,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2567,8 +2567,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2587,8 +2587,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2636,8 +2636,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2654,8 +2654,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2675,8 +2675,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2695,8 +2695,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2744,8 +2744,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2762,8 +2762,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2783,8 +2783,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2803,8 +2803,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2852,8 +2852,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2870,8 +2870,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2891,8 +2891,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2911,8 +2911,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2960,8 +2960,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2979,8 +2979,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2998,8 +2998,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3016,8 +3016,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3067,8 +3067,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3087,8 +3087,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3108,8 +3108,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3128,8 +3128,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3181,8 +3181,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3201,8 +3201,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3222,8 +3222,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3242,8 +3242,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3296,8 +3296,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3317,8 +3317,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3340,8 +3340,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3362,8 +3362,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3418,8 +3418,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3439,8 +3439,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3462,8 +3462,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3484,8 +3484,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3540,8 +3540,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3560,8 +3560,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3581,8 +3581,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3601,8 +3601,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3654,8 +3654,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3674,8 +3674,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3695,8 +3695,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3715,8 +3715,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3768,8 +3768,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3789,8 +3789,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3812,8 +3812,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3834,8 +3834,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3890,8 +3890,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3911,8 +3911,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3934,8 +3934,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3956,8 +3956,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4012,8 +4012,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4033,8 +4033,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4056,8 +4056,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4078,8 +4078,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4134,8 +4134,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4155,8 +4155,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4178,8 +4178,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4200,8 +4200,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4256,8 +4256,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4277,8 +4277,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4300,8 +4300,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4322,8 +4322,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4378,8 +4378,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4399,8 +4399,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4422,8 +4422,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4444,8 +4444,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4500,8 +4500,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4521,8 +4521,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4544,8 +4544,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4566,8 +4566,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4622,8 +4622,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4643,8 +4643,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4666,8 +4666,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4688,8 +4688,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -6147,8 +6147,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6162,8 +6162,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -6177,8 +6177,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6191,8 +6191,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6232,8 +6232,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6249,8 +6249,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6267,8 +6267,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6284,8 +6284,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -6330,8 +6330,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6346,8 +6346,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6363,8 +6363,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6379,8 +6379,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6423,8 +6423,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6441,8 +6441,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6461,8 +6461,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6480,8 +6480,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6529,8 +6529,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6547,8 +6547,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6567,8 +6567,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6586,8 +6586,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6635,8 +6635,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6652,8 +6652,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6670,8 +6670,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6687,8 +6687,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -6733,8 +6733,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6750,8 +6750,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6768,8 +6768,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6785,8 +6785,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -6831,8 +6831,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6849,8 +6849,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6869,8 +6869,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6888,8 +6888,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6937,8 +6937,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6955,8 +6955,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6975,8 +6975,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6994,8 +6994,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7043,8 +7043,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7061,8 +7061,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7081,8 +7081,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7100,8 +7100,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7149,8 +7149,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7167,8 +7167,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7187,8 +7187,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7206,8 +7206,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7255,8 +7255,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7273,8 +7273,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7293,8 +7293,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7312,8 +7312,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7361,8 +7361,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7379,8 +7379,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7399,8 +7399,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7418,8 +7418,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7467,8 +7467,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7485,8 +7485,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7505,8 +7505,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7524,8 +7524,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7573,8 +7573,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7591,8 +7591,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7611,8 +7611,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7630,8 +7630,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7679,8 +7679,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7698,8 +7698,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7717,8 +7717,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7735,8 +7735,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7786,8 +7786,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7807,8 +7807,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -7829,8 +7829,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -7850,8 +7850,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -7905,8 +7905,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -7925,8 +7925,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7946,8 +7946,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7966,8 +7966,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8020,8 +8020,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8042,8 +8042,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8066,8 +8066,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8089,8 +8089,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8147,8 +8147,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8169,8 +8169,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8193,8 +8193,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8216,8 +8216,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8274,8 +8274,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8295,8 +8295,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -8317,8 +8317,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -8338,8 +8338,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8393,8 +8393,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8414,8 +8414,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -8436,8 +8436,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -8457,8 +8457,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8512,8 +8512,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8534,8 +8534,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8558,8 +8558,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8581,8 +8581,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8639,8 +8639,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8661,8 +8661,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8685,8 +8685,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8708,8 +8708,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8766,8 +8766,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8788,8 +8788,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8812,8 +8812,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8835,8 +8835,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8893,8 +8893,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8915,8 +8915,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8939,8 +8939,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8962,8 +8962,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9020,8 +9020,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9042,8 +9042,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9066,8 +9066,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9089,8 +9089,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9147,8 +9147,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9169,8 +9169,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9193,8 +9193,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9216,8 +9216,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9274,8 +9274,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9296,8 +9296,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9320,8 +9320,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9343,8 +9343,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9401,8 +9401,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9423,8 +9423,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9447,8 +9447,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9470,8 +9470,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -1258,8 +1258,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1273,8 +1273,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1288,8 +1288,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1302,8 +1302,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1343,8 +1343,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1358,8 +1358,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1373,8 +1373,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1387,8 +1387,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1428,8 +1428,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1443,8 +1443,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1458,8 +1458,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1472,8 +1472,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1513,8 +1513,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1528,8 +1528,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1543,8 +1543,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1557,8 +1557,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1598,8 +1598,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1613,8 +1613,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1628,8 +1628,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1642,8 +1642,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1683,8 +1683,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1698,8 +1698,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1713,8 +1713,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1727,8 +1727,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1768,8 +1768,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1783,8 +1783,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1798,8 +1798,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1812,8 +1812,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1853,8 +1853,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1868,8 +1868,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1883,8 +1883,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1897,8 +1897,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1938,8 +1938,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1953,8 +1953,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1968,8 +1968,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1982,8 +1982,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -2023,8 +2023,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2038,8 +2038,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -2053,8 +2053,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -2067,8 +2067,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -2108,8 +2108,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2123,8 +2123,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -2138,8 +2138,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -2152,8 +2152,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -2193,8 +2193,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2208,8 +2208,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -2223,8 +2223,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -2237,8 +2237,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -2278,8 +2278,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2293,8 +2293,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -2308,8 +2308,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -2322,8 +2322,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -2363,8 +2363,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2378,8 +2378,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -2393,8 +2393,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -2407,8 +2407,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -2448,8 +2448,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2463,8 +2463,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -2478,8 +2478,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -2492,8 +2492,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -2533,8 +2533,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2552,8 +2552,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2571,8 +2571,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2589,8 +2589,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -2640,8 +2640,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2659,8 +2659,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2678,8 +2678,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2696,8 +2696,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -2747,8 +2747,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2766,8 +2766,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2785,8 +2785,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2803,8 +2803,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -2854,8 +2854,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2873,8 +2873,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2892,8 +2892,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2910,8 +2910,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -2961,8 +2961,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2980,8 +2980,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2999,8 +2999,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3017,8 +3017,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3068,8 +3068,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3087,8 +3087,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3106,8 +3106,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3124,8 +3124,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3175,8 +3175,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3194,8 +3194,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3213,8 +3213,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3231,8 +3231,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3282,8 +3282,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3301,8 +3301,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3320,8 +3320,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3338,8 +3338,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3389,8 +3389,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3408,8 +3408,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3427,8 +3427,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3445,8 +3445,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3496,8 +3496,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3515,8 +3515,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3534,8 +3534,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3552,8 +3552,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3603,8 +3603,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3622,8 +3622,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3641,8 +3641,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3659,8 +3659,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3710,8 +3710,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3729,8 +3729,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3748,8 +3748,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3766,8 +3766,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3817,8 +3817,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3836,8 +3836,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3855,8 +3855,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3873,8 +3873,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3924,8 +3924,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3943,8 +3943,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3962,8 +3962,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3980,8 +3980,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -4031,8 +4031,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -4050,8 +4050,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -4069,8 +4069,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -4087,8 +4087,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -5381,8 +5381,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5396,8 +5396,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -5411,8 +5411,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -5425,8 +5425,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5466,8 +5466,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5481,8 +5481,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -5496,8 +5496,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -5510,8 +5510,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5551,8 +5551,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5566,8 +5566,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -5581,8 +5581,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -5595,8 +5595,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5636,8 +5636,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5651,8 +5651,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -5666,8 +5666,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -5680,8 +5680,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5721,8 +5721,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5736,8 +5736,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -5751,8 +5751,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -5765,8 +5765,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5806,8 +5806,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5821,8 +5821,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -5836,8 +5836,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -5850,8 +5850,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5891,8 +5891,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5906,8 +5906,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -5921,8 +5921,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -5935,8 +5935,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5976,8 +5976,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5991,8 +5991,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -6006,8 +6006,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6020,8 +6020,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6061,8 +6061,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6076,8 +6076,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -6091,8 +6091,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6105,8 +6105,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6146,8 +6146,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6161,8 +6161,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -6176,8 +6176,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6190,8 +6190,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6231,8 +6231,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6246,8 +6246,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -6261,8 +6261,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6275,8 +6275,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6316,8 +6316,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6331,8 +6331,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -6346,8 +6346,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6360,8 +6360,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6401,8 +6401,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6416,8 +6416,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -6431,8 +6431,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6445,8 +6445,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6486,8 +6486,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6501,8 +6501,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -6516,8 +6516,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6530,8 +6530,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6571,8 +6571,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6586,8 +6586,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -6601,8 +6601,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6615,8 +6615,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6656,8 +6656,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -6675,8 +6675,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -6694,8 +6694,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -6712,8 +6712,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -6763,8 +6763,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -6782,8 +6782,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -6801,8 +6801,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -6819,8 +6819,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -6870,8 +6870,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -6889,8 +6889,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -6908,8 +6908,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -6926,8 +6926,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -6977,8 +6977,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -6996,8 +6996,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7015,8 +7015,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7033,8 +7033,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7084,8 +7084,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7103,8 +7103,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7122,8 +7122,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7140,8 +7140,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7191,8 +7191,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7210,8 +7210,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7229,8 +7229,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7247,8 +7247,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7298,8 +7298,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7317,8 +7317,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7336,8 +7336,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7354,8 +7354,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7405,8 +7405,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7424,8 +7424,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7443,8 +7443,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7461,8 +7461,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7512,8 +7512,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7531,8 +7531,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7550,8 +7550,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7568,8 +7568,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7619,8 +7619,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7638,8 +7638,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7657,8 +7657,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7675,8 +7675,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7726,8 +7726,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7745,8 +7745,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7764,8 +7764,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7782,8 +7782,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7833,8 +7833,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7852,8 +7852,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7871,8 +7871,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7889,8 +7889,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7940,8 +7940,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7959,8 +7959,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7978,8 +7978,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7996,8 +7996,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -8047,8 +8047,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8066,8 +8066,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -8085,8 +8085,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8103,8 +8103,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -8154,8 +8154,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8173,8 +8173,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -8192,8 +8192,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8210,8 +8210,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -1432,8 +1432,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1447,8 +1447,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1462,8 +1462,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1476,8 +1476,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1517,8 +1517,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1534,8 +1534,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1553,8 +1553,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1571,8 +1571,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1619,8 +1619,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1635,8 +1635,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1652,8 +1652,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1668,8 +1668,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1714,8 +1714,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1732,8 +1732,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1753,8 +1753,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1773,8 +1773,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1826,8 +1826,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1844,8 +1844,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1865,8 +1865,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1885,8 +1885,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1938,8 +1938,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1955,8 +1955,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1974,8 +1974,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1992,8 +1992,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2040,8 +2040,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2057,8 +2057,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2076,8 +2076,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2094,8 +2094,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2142,8 +2142,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2160,8 +2160,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2181,8 +2181,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2201,8 +2201,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2254,8 +2254,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2272,8 +2272,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2293,8 +2293,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2313,8 +2313,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2366,8 +2366,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2384,8 +2384,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2405,8 +2405,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2425,8 +2425,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2478,8 +2478,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2496,8 +2496,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2517,8 +2517,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2537,8 +2537,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2590,8 +2590,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2608,8 +2608,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2629,8 +2629,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2649,8 +2649,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2702,8 +2702,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2720,8 +2720,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2741,8 +2741,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2761,8 +2761,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2814,8 +2814,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2832,8 +2832,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2853,8 +2853,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2873,8 +2873,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2926,8 +2926,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2944,8 +2944,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2965,8 +2965,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2985,8 +2985,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -3038,8 +3038,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3057,8 +3057,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3076,8 +3076,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3094,8 +3094,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3145,8 +3145,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3165,8 +3165,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3186,8 +3186,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3206,8 +3206,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3261,8 +3261,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3281,8 +3281,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3302,8 +3302,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3322,8 +3322,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3378,8 +3378,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3399,8 +3399,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3422,8 +3422,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3444,8 +3444,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3504,8 +3504,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3525,8 +3525,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3548,8 +3548,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3570,8 +3570,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3630,8 +3630,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3650,8 +3650,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3671,8 +3671,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3691,8 +3691,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3746,8 +3746,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3766,8 +3766,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3787,8 +3787,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3807,8 +3807,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3862,8 +3862,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3883,8 +3883,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3906,8 +3906,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3928,8 +3928,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3988,8 +3988,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4009,8 +4009,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4032,8 +4032,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4054,8 +4054,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4114,8 +4114,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4135,8 +4135,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4158,8 +4158,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4180,8 +4180,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4240,8 +4240,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4261,8 +4261,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4284,8 +4284,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4306,8 +4306,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4366,8 +4366,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4387,8 +4387,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4410,8 +4410,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4432,8 +4432,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4492,8 +4492,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4513,8 +4513,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4536,8 +4536,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4558,8 +4558,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4618,8 +4618,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4639,8 +4639,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4662,8 +4662,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4684,8 +4684,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4744,8 +4744,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4765,8 +4765,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4788,8 +4788,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4810,8 +4810,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -6303,8 +6303,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6318,8 +6318,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -6333,8 +6333,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6347,8 +6347,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6388,8 +6388,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6405,8 +6405,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6423,8 +6423,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6440,8 +6440,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -6488,8 +6488,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6504,8 +6504,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6521,8 +6521,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6537,8 +6537,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6583,8 +6583,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6601,8 +6601,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6621,8 +6621,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6640,8 +6640,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6693,8 +6693,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6711,8 +6711,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6731,8 +6731,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6750,8 +6750,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6803,8 +6803,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6820,8 +6820,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6838,8 +6838,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6855,8 +6855,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -6903,8 +6903,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6920,8 +6920,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6938,8 +6938,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6955,8 +6955,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -7003,8 +7003,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7021,8 +7021,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7041,8 +7041,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7060,8 +7060,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7113,8 +7113,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7131,8 +7131,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7151,8 +7151,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7170,8 +7170,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7223,8 +7223,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7241,8 +7241,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7261,8 +7261,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7280,8 +7280,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7333,8 +7333,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7351,8 +7351,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7371,8 +7371,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7390,8 +7390,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7443,8 +7443,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7461,8 +7461,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7481,8 +7481,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7500,8 +7500,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7553,8 +7553,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7571,8 +7571,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7591,8 +7591,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7610,8 +7610,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7663,8 +7663,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7681,8 +7681,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7701,8 +7701,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7720,8 +7720,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7773,8 +7773,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7791,8 +7791,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7811,8 +7811,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7830,8 +7830,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7883,8 +7883,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7902,8 +7902,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7921,8 +7921,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7939,8 +7939,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7990,8 +7990,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8011,8 +8011,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -8033,8 +8033,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -8054,8 +8054,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8111,8 +8111,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8131,8 +8131,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8152,8 +8152,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8172,8 +8172,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8228,8 +8228,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8250,8 +8250,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8274,8 +8274,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8297,8 +8297,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8359,8 +8359,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8381,8 +8381,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8405,8 +8405,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8428,8 +8428,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8490,8 +8490,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8511,8 +8511,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -8533,8 +8533,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -8554,8 +8554,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8611,8 +8611,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8632,8 +8632,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -8654,8 +8654,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) @@ -8675,8 +8675,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) @@ -8732,8 +8732,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8754,8 +8754,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8778,8 +8778,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8801,8 +8801,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8863,8 +8863,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8885,8 +8885,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8909,8 +8909,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8932,8 +8932,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8994,8 +8994,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9016,8 +9016,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9040,8 +9040,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9063,8 +9063,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9125,8 +9125,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9147,8 +9147,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9171,8 +9171,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9194,8 +9194,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9256,8 +9256,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9278,8 +9278,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9302,8 +9302,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9325,8 +9325,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9387,8 +9387,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9409,8 +9409,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9433,8 +9433,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9456,8 +9456,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9518,8 +9518,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9540,8 +9540,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9564,8 +9564,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9587,8 +9587,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9649,8 +9649,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9671,8 +9671,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9695,8 +9695,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9718,8 +9718,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt vmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -1258,8 +1258,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1273,8 +1273,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1288,8 +1288,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1302,8 +1302,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1343,8 +1343,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1358,8 +1358,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1373,8 +1373,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1387,8 +1387,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1428,8 +1428,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1443,8 +1443,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1458,8 +1458,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1472,8 +1472,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1513,8 +1513,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1528,8 +1528,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1543,8 +1543,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1557,8 +1557,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1598,8 +1598,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1613,8 +1613,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1628,8 +1628,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1642,8 +1642,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1683,8 +1683,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1698,8 +1698,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1713,8 +1713,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1727,8 +1727,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1768,8 +1768,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1783,8 +1783,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1798,8 +1798,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1812,8 +1812,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1853,8 +1853,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1868,8 +1868,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1883,8 +1883,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1897,8 +1897,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1938,8 +1938,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1953,8 +1953,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1968,8 +1968,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1982,8 +1982,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -2023,8 +2023,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2038,8 +2038,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -2053,8 +2053,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -2067,8 +2067,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -2108,8 +2108,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2123,8 +2123,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -2138,8 +2138,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -2152,8 +2152,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -2193,8 +2193,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2208,8 +2208,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -2223,8 +2223,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -2237,8 +2237,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -2278,8 +2278,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2293,8 +2293,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -2308,8 +2308,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -2322,8 +2322,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -2363,8 +2363,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2378,8 +2378,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -2393,8 +2393,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -2407,8 +2407,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -2448,8 +2448,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2463,8 +2463,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -2478,8 +2478,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -2492,8 +2492,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -2533,8 +2533,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2552,8 +2552,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2571,8 +2571,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2589,8 +2589,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -2640,8 +2640,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2659,8 +2659,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2678,8 +2678,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2696,8 +2696,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -2747,8 +2747,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2766,8 +2766,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2785,8 +2785,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2803,8 +2803,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -2854,8 +2854,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2873,8 +2873,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2892,8 +2892,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2910,8 +2910,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -2961,8 +2961,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2980,8 +2980,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2999,8 +2999,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3017,8 +3017,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3068,8 +3068,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3087,8 +3087,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3106,8 +3106,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3124,8 +3124,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3175,8 +3175,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3194,8 +3194,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3213,8 +3213,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3231,8 +3231,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3282,8 +3282,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3301,8 +3301,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3320,8 +3320,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3338,8 +3338,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3389,8 +3389,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3408,8 +3408,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3427,8 +3427,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3445,8 +3445,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3496,8 +3496,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3515,8 +3515,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3534,8 +3534,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3552,8 +3552,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3603,8 +3603,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3622,8 +3622,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3641,8 +3641,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3659,8 +3659,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3710,8 +3710,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3729,8 +3729,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3748,8 +3748,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3766,8 +3766,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3817,8 +3817,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3836,8 +3836,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3855,8 +3855,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3873,8 +3873,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -3924,8 +3924,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3943,8 +3943,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -3962,8 +3962,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -3980,8 +3980,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -4031,8 +4031,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -4050,8 +4050,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -4069,8 +4069,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -4087,8 +4087,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -5381,8 +5381,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5396,8 +5396,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -5411,8 +5411,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -5425,8 +5425,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5466,8 +5466,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5481,8 +5481,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -5496,8 +5496,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -5510,8 +5510,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5551,8 +5551,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5566,8 +5566,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -5581,8 +5581,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -5595,8 +5595,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5636,8 +5636,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5651,8 +5651,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -5666,8 +5666,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -5680,8 +5680,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5721,8 +5721,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5736,8 +5736,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -5751,8 +5751,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -5765,8 +5765,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5806,8 +5806,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5821,8 +5821,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -5836,8 +5836,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -5850,8 +5850,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5891,8 +5891,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5906,8 +5906,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -5921,8 +5921,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -5935,8 +5935,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5976,8 +5976,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5991,8 +5991,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -6006,8 +6006,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6020,8 +6020,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6061,8 +6061,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6076,8 +6076,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -6091,8 +6091,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6105,8 +6105,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6146,8 +6146,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6161,8 +6161,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -6176,8 +6176,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6190,8 +6190,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6231,8 +6231,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6246,8 +6246,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -6261,8 +6261,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6275,8 +6275,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6316,8 +6316,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6331,8 +6331,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -6346,8 +6346,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6360,8 +6360,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6401,8 +6401,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6416,8 +6416,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -6431,8 +6431,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6445,8 +6445,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6486,8 +6486,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6501,8 +6501,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -6516,8 +6516,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6530,8 +6530,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6571,8 +6571,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6586,8 +6586,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -6601,8 +6601,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6615,8 +6615,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6656,8 +6656,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -6675,8 +6675,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -6694,8 +6694,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -6712,8 +6712,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -6763,8 +6763,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -6782,8 +6782,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -6801,8 +6801,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -6819,8 +6819,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -6870,8 +6870,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -6889,8 +6889,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -6908,8 +6908,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -6926,8 +6926,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -6977,8 +6977,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -6996,8 +6996,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7015,8 +7015,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7033,8 +7033,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7084,8 +7084,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7103,8 +7103,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7122,8 +7122,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7140,8 +7140,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7191,8 +7191,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7210,8 +7210,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7229,8 +7229,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7247,8 +7247,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7298,8 +7298,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7317,8 +7317,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7336,8 +7336,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7354,8 +7354,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7405,8 +7405,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7424,8 +7424,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7443,8 +7443,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7461,8 +7461,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7512,8 +7512,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7531,8 +7531,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7550,8 +7550,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7568,8 +7568,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7619,8 +7619,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7638,8 +7638,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7657,8 +7657,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7675,8 +7675,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7726,8 +7726,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7745,8 +7745,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7764,8 +7764,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7782,8 +7782,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7833,8 +7833,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7852,8 +7852,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7871,8 +7871,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7889,8 +7889,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7940,8 +7940,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7959,8 +7959,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -7978,8 +7978,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7996,8 +7996,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -8047,8 +8047,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8066,8 +8066,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -8085,8 +8085,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8103,8 +8103,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -1359,8 +1359,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1374,8 +1374,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -1389,8 +1389,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -1403,8 +1403,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -1444,8 +1444,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1460,8 +1460,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1478,8 +1478,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1493,8 +1493,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1538,8 +1538,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1554,8 +1554,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1571,8 +1571,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1586,8 +1586,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1630,8 +1630,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1647,8 +1647,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1667,8 +1667,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1683,8 +1683,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1731,8 +1731,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1748,8 +1748,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1768,8 +1768,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1784,8 +1784,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1832,8 +1832,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1848,8 +1848,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1866,8 +1866,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1881,8 +1881,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -1926,8 +1926,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -1942,8 +1942,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1960,8 +1960,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1975,8 +1975,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2020,8 +2020,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2037,8 +2037,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2057,8 +2057,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2073,8 +2073,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2121,8 +2121,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2138,8 +2138,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2158,8 +2158,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2174,8 +2174,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2222,8 +2222,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2239,8 +2239,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2259,8 +2259,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2275,8 +2275,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2323,8 +2323,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2340,8 +2340,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2360,8 +2360,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2376,8 +2376,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2424,8 +2424,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2443,8 +2443,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -2462,8 +2462,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -2480,8 +2480,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -2531,8 +2531,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -2551,8 +2551,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2571,8 +2571,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2590,8 +2590,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -2643,8 +2643,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -2663,8 +2663,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2684,8 +2684,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -2703,8 +2703,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -2757,8 +2757,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -2778,8 +2778,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2800,8 +2800,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -2820,8 +2820,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -2876,8 +2876,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -2897,8 +2897,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2919,8 +2919,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -2939,8 +2939,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -2995,8 +2995,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3015,8 +3015,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3035,8 +3035,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3054,8 +3054,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3107,8 +3107,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -3127,8 +3127,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -3147,8 +3147,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3166,8 +3166,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) @@ -3219,8 +3219,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3240,8 +3240,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3262,8 +3262,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3282,8 +3282,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3338,8 +3338,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3359,8 +3359,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3381,8 +3381,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3401,8 +3401,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3457,8 +3457,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3478,8 +3478,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3500,8 +3500,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3520,8 +3520,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3576,8 +3576,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3597,8 +3597,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3619,8 +3619,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3639,8 +3639,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3695,8 +3695,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3716,8 +3716,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3738,8 +3738,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3758,8 +3758,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3814,8 +3814,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3835,8 +3835,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3857,8 +3857,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3877,8 +3877,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3933,8 +3933,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3954,8 +3954,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3976,8 +3976,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3996,8 +3996,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4052,8 +4052,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4073,8 +4073,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4095,8 +4095,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4115,8 +4115,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: s_waitcnt lgkmcnt(0) ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -5465,8 +5465,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5480,8 +5480,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_endpgm @@ -5495,8 +5495,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -5509,8 +5509,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5550,8 +5550,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5565,8 +5565,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5582,8 +5582,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -5596,8 +5596,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5639,8 +5639,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5654,8 +5654,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5671,8 +5671,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -5685,8 +5685,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5727,8 +5727,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5742,8 +5742,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5761,8 +5761,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -5775,8 +5775,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5819,8 +5819,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5834,8 +5834,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5853,8 +5853,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -5867,8 +5867,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -5911,8 +5911,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5926,8 +5926,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5943,8 +5943,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -5957,8 +5957,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6000,8 +6000,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6015,8 +6015,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6032,8 +6032,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6046,8 +6046,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6089,8 +6089,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6104,8 +6104,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6123,8 +6123,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6137,8 +6137,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6181,8 +6181,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6196,8 +6196,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6215,8 +6215,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6229,8 +6229,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6273,8 +6273,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6288,8 +6288,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6307,8 +6307,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6321,8 +6321,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6365,8 +6365,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6380,8 +6380,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6399,8 +6399,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6413,8 +6413,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6457,8 +6457,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6472,8 +6472,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6491,8 +6491,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6505,8 +6505,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6549,8 +6549,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6564,8 +6564,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6583,8 +6583,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6597,8 +6597,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6641,8 +6641,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6656,8 +6656,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6675,8 +6675,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6689,8 +6689,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6733,8 +6733,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6748,8 +6748,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s0, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6767,8 +6767,8 @@ ; GFX10-CU-NEXT: s_add_u32 s0, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s1, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX10-CU-NEXT: s_endpgm @@ -6781,8 +6781,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s2, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s3, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s3 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; SKIP-CACHE-INV-NEXT: s_endpgm @@ -6825,8 +6825,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -6844,8 +6844,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 @@ -6863,8 +6863,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -6881,8 +6881,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -6932,8 +6932,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -6951,8 +6951,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -6972,8 +6972,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -6990,8 +6990,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7042,8 +7042,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7061,8 +7061,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7082,8 +7082,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7100,8 +7100,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7152,8 +7152,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7171,8 +7171,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7194,8 +7194,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7212,8 +7212,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7265,8 +7265,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7284,8 +7284,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7307,8 +7307,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7325,8 +7325,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7378,8 +7378,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7397,8 +7397,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -7418,8 +7418,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7436,8 +7436,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7488,8 +7488,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7507,8 +7507,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) @@ -7528,8 +7528,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7546,8 +7546,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7598,8 +7598,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7617,8 +7617,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7640,8 +7640,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7658,8 +7658,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7711,8 +7711,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7730,8 +7730,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7753,8 +7753,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7771,8 +7771,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7824,8 +7824,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7843,8 +7843,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7866,8 +7866,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7884,8 +7884,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -7937,8 +7937,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7956,8 +7956,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7979,8 +7979,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -7997,8 +7997,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -8050,8 +8050,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8069,8 +8069,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8092,8 +8092,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8110,8 +8110,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -8163,8 +8163,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8182,8 +8182,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8205,8 +8205,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8223,8 +8223,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -8276,8 +8276,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8295,8 +8295,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8318,8 +8318,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8336,8 +8336,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 @@ -8389,8 +8389,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8408,8 +8408,8 @@ ; GFX10-WGP-NEXT: s_add_u32 s4, s0, 16 ; GFX10-WGP-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-WGP-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8431,8 +8431,8 @@ ; GFX10-CU-NEXT: s_add_u32 s4, s0, 16 ; GFX10-CU-NEXT: s_addc_u32 s5, s1, 0 ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX10-CU-NEXT: v_mov_b32_e32 v3, s3 ; GFX10-CU-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 @@ -8449,8 +8449,8 @@ ; SKIP-CACHE-INV-NEXT: s_add_u32 s4, s2, 16 ; SKIP-CACHE-INV-NEXT: s_addc_u32 s5, s3, 0 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s4 -; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v1, s5 +; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v3, s1 ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v2, s0 ; SKIP-CACHE-INV-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; SKIP-CACHE-INV-NEXT: v_mov_b32_e32 v0, s2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -1574,8 +1574,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1665,8 +1665,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1768,8 +1768,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1868,8 +1868,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1981,8 +1981,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2093,8 +2093,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2197,8 +2197,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2302,8 +2302,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2415,8 +2415,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2528,8 +2528,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2641,8 +2641,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2754,8 +2754,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2867,8 +2867,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2980,8 +2980,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -3093,8 +3093,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -3205,8 +3205,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3313,8 +3313,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3428,8 +3428,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3545,8 +3545,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3669,8 +3669,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3792,8 +3792,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3907,8 +3907,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -4023,8 +4023,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4147,8 +4147,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4271,8 +4271,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4395,8 +4395,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4519,8 +4519,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4643,8 +4643,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4767,8 +4767,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4891,8 +4891,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -6557,8 +6557,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6648,8 +6648,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6751,8 +6751,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6851,8 +6851,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6964,8 +6964,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7076,8 +7076,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7180,8 +7180,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -7285,8 +7285,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7398,8 +7398,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7511,8 +7511,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7624,8 +7624,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7737,8 +7737,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7850,8 +7850,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7963,8 +7963,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -8076,8 +8076,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -8188,8 +8188,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8296,8 +8296,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8412,8 +8412,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8536,8 +8536,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8659,8 +8659,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8774,8 +8774,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8890,8 +8890,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9014,8 +9014,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9138,8 +9138,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9262,8 +9262,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9386,8 +9386,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9510,8 +9510,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9634,8 +9634,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9758,8 +9758,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -1421,8 +1421,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1510,8 +1510,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1599,8 +1599,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1688,8 +1688,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1777,8 +1777,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1866,8 +1866,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1955,8 +1955,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2044,8 +2044,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2133,8 +2133,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2222,8 +2222,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2311,8 +2311,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2400,8 +2400,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2489,8 +2489,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2578,8 +2578,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2667,8 +2667,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2758,8 +2758,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2865,8 +2865,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2972,8 +2972,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3079,8 +3079,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3186,8 +3186,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3293,8 +3293,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3400,8 +3400,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3507,8 +3507,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3614,8 +3614,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3721,8 +3721,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3828,8 +3828,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3935,8 +3935,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -4042,8 +4042,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -4149,8 +4149,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -4256,8 +4256,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -5754,8 +5754,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5843,8 +5843,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5932,8 +5932,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6021,8 +6021,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6110,8 +6110,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6199,8 +6199,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6288,8 +6288,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6377,8 +6377,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6466,8 +6466,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6555,8 +6555,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6644,8 +6644,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6733,8 +6733,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6822,8 +6822,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6911,8 +6911,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -7000,8 +7000,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -7091,8 +7091,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7198,8 +7198,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7305,8 +7305,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7412,8 +7412,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7519,8 +7519,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7626,8 +7626,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7733,8 +7733,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7840,8 +7840,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7947,8 +7947,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8054,8 +8054,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8161,8 +8161,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8268,8 +8268,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8375,8 +8375,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8482,8 +8482,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8589,8 +8589,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -1604,8 +1604,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1695,8 +1695,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -1800,8 +1800,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1902,8 +1902,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2019,8 +2019,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2135,8 +2135,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2241,8 +2241,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -2348,8 +2348,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2465,8 +2465,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2582,8 +2582,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2699,8 +2699,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2815,8 +2815,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2923,8 +2923,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3041,8 +3041,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3169,8 +3169,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3296,8 +3296,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3413,8 +3413,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -3531,8 +3531,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3659,8 +3659,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3787,8 +3787,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3915,8 +3915,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4043,8 +4043,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4171,8 +4171,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4299,8 +4299,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4427,8 +4427,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -6127,8 +6127,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6218,8 +6218,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6323,8 +6323,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6425,8 +6425,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6542,8 +6542,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6658,8 +6658,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6764,8 +6764,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -6871,8 +6871,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -6988,8 +6988,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7105,8 +7105,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7222,8 +7222,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7339,8 +7339,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7456,8 +7456,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7573,8 +7573,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7690,8 +7690,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -7806,8 +7806,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7914,8 +7914,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8031,8 +8031,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8150,8 +8150,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8278,8 +8278,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8405,8 +8405,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8522,8 +8522,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: s_waitcnt vmcnt(0) @@ -8640,8 +8640,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8768,8 +8768,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -8896,8 +8896,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9024,8 +9024,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9152,8 +9152,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9280,8 +9280,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9408,8 +9408,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -9536,8 +9536,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -1421,8 +1421,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1510,8 +1510,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1599,8 +1599,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1688,8 +1688,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1777,8 +1777,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1866,8 +1866,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1955,8 +1955,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2044,8 +2044,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2133,8 +2133,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2222,8 +2222,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2311,8 +2311,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2400,8 +2400,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2489,8 +2489,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2578,8 +2578,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2667,8 +2667,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2758,8 +2758,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2865,8 +2865,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -2972,8 +2972,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3079,8 +3079,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3186,8 +3186,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3293,8 +3293,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3400,8 +3400,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3507,8 +3507,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3614,8 +3614,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3721,8 +3721,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3828,8 +3828,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3935,8 +3935,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -4042,8 +4042,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -4149,8 +4149,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -4256,8 +4256,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -5754,8 +5754,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5843,8 +5843,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -5932,8 +5932,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6021,8 +6021,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6110,8 +6110,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6199,8 +6199,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6288,8 +6288,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6377,8 +6377,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6466,8 +6466,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6555,8 +6555,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6644,8 +6644,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6733,8 +6733,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6822,8 +6822,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6911,8 +6911,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -7000,8 +7000,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -7091,8 +7091,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7198,8 +7198,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7305,8 +7305,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7412,8 +7412,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7519,8 +7519,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7626,8 +7626,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7733,8 +7733,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7840,8 +7840,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7947,8 +7947,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8054,8 +8054,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8161,8 +8161,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8268,8 +8268,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8375,8 +8375,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8482,8 +8482,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8589,8 +8589,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -1503,8 +1503,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1592,8 +1592,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -1686,8 +1686,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1783,8 +1783,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1884,8 +1884,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -1984,8 +1984,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2077,8 +2077,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -2171,8 +2171,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2272,8 +2272,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2373,8 +2373,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2474,8 +2474,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2575,8 +2575,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2676,8 +2676,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2777,8 +2777,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2878,8 +2878,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] @@ -2980,8 +2980,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3087,8 +3087,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3197,8 +3197,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3312,8 +3312,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3429,8 +3429,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3545,8 +3545,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3654,8 +3654,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -3764,8 +3764,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3881,8 +3881,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -3998,8 +3998,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4115,8 +4115,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4232,8 +4232,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4349,8 +4349,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4466,8 +4466,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -4583,8 +4583,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc @@ -6134,8 +6134,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6223,8 +6223,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6316,8 +6316,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6408,8 +6408,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6504,8 +6504,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6600,8 +6600,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6693,8 +6693,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6786,8 +6786,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6882,8 +6882,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -6978,8 +6978,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -7074,8 +7074,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -7170,8 +7170,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -7266,8 +7266,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -7362,8 +7362,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -7458,8 +7458,8 @@ ; GFX7-NEXT: s_add_u32 s0, s0, 16 ; GFX7-NEXT: s_addc_u32 s1, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v[0:1], v[2:3] ; GFX7-NEXT: s_endpgm @@ -7556,8 +7556,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7663,8 +7663,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7772,8 +7772,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7882,8 +7882,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -7994,8 +7994,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8106,8 +8106,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8215,8 +8215,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8324,8 +8324,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8436,8 +8436,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8548,8 +8548,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8660,8 +8660,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8772,8 +8772,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8884,8 +8884,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -8996,8 +8996,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -9108,8 +9108,8 @@ ; GFX7-NEXT: s_add_u32 s4, s0, 16 ; GFX7-NEXT: s_addc_u32 s5, s1, 0 ; GFX7-NEXT: v_mov_b32_e32 v0, s4 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: flat_atomic_cmpswap v2, v[0:1], v[2:3] glc ; GFX7-NEXT: v_mov_b32_e32 v0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -10,16 +10,16 @@ define amdgpu_kernel void @private_nontemporal_load_0( ; GFX6-LABEL: private_nontemporal_load_0: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] ; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] ; GFX6-NEXT: s_add_u32 s8, s8, s7 ; GFX6-NEXT: s_addc_u32 s9, s9, 0 -; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s6 ; GFX6-NEXT: buffer_load_dword v0, v0, s[8:11], 0 offen glc slc +; GFX6-NEXT: s_mov_b32 s3, 0x100f000 ; GFX6-NEXT: s_mov_b32 s2, -1 ; GFX6-NEXT: s_waitcnt vmcnt(0) ; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 @@ -135,10 +135,10 @@ define amdgpu_kernel void @private_nontemporal_load_1( ; GFX6-LABEL: private_nontemporal_load_1: ; GFX6: ; %bb.0: ; %entry -; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] ; GFX6-NEXT: s_mov_b64 s[8:9], s[0:1] ; GFX6-NEXT: s_load_dword s6, s[4:5], 0x0 ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x2 +; GFX6-NEXT: s_mov_b64 s[10:11], s[2:3] ; GFX6-NEXT: s_add_u32 s8, s8, s7 ; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_addc_u32 s9, s9, 0 @@ -270,9 +270,9 @@ ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX6-NEXT: s_add_u32 s8, s8, s7 -; GFX6-NEXT: s_addc_u32 s9, s9, 0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: s_addc_u32 s9, s9, 0 ; GFX6-NEXT: v_mov_b32_e32 v1, s2 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: v_mov_b32_e32 v0, s0 @@ -286,9 +286,9 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_add_u32 s8, s8, s7 -; GFX7-NEXT: s_addc_u32 s9, s9, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -354,9 +354,9 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0 ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -370,9 +370,9 @@ ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0 ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v0, s0 @@ -393,9 +393,9 @@ ; GFX6-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX6-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX6-NEXT: s_add_u32 s8, s8, s7 -; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) ; GFX6-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX6-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX6-NEXT: s_addc_u32 s9, s9, 0 ; GFX6-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; GFX6-NEXT: s_waitcnt lgkmcnt(0) @@ -410,9 +410,9 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_add_u32 s8, s8, s7 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: s_addc_u32 s9, s9, 0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) @@ -480,9 +480,9 @@ ; GFX90A-NOTTGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-NOTTGSPLIT-NEXT: s_add_u32 s8, s8, s7 -; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX90A-NOTTGSPLIT-NEXT: s_addc_u32 s9, s9, 0 ; GFX90A-NOTTGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2 ; GFX90A-NOTTGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NOTTGSPLIT-NEXT: v_mov_b32_e32 v1, s0 @@ -496,9 +496,9 @@ ; GFX90A-TGSPLIT-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX90A-TGSPLIT-NEXT: s_load_dword s2, s[4:5], 0x8 ; GFX90A-TGSPLIT-NEXT: s_add_u32 s8, s8, s7 -; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX90A-TGSPLIT-NEXT: s_addc_u32 s9, s9, 0 ; GFX90A-TGSPLIT-NEXT: v_lshl_add_u32 v0, v0, 2, s2 ; GFX90A-TGSPLIT-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-TGSPLIT-NEXT: v_mov_b32_e32 v1, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -227,9 +227,9 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_add_u32 s8, s8, s7 -; GFX7-NEXT: s_addc_u32 s9, s9, 0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: s_addc_u32 s9, s9, 0 ; GFX7-NEXT: v_mov_b32_e32 v1, s2 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v0, s0 @@ -326,9 +326,9 @@ ; GFX7-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 ; GFX7-NEXT: s_load_dword s2, s[4:5], 0x2 ; GFX7-NEXT: s_add_u32 s8, s8, s7 -; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: s_load_dword s0, s[0:1], 0x0 +; GFX7-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX7-NEXT: s_addc_u32 s9, s9, 0 ; GFX7-NEXT: v_add_i32_e32 v0, vcc, s2, v0 ; GFX7-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory_clause.ll b/llvm/test/CodeGen/AMDGPU/memory_clause.ll --- a/llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ b/llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -85,10 +85,10 @@ ; GCN-NEXT: v_mov_b32_e32 v2, s2 ; GCN-NEXT: v_mov_b32_e32 v3, s3 ; GCN-NEXT: v_mov_b32_e32 v4, s4 -; GCN-NEXT: v_mov_b32_e32 v8, s8 ; GCN-NEXT: v_mov_b32_e32 v5, s5 ; GCN-NEXT: v_mov_b32_e32 v6, s6 ; GCN-NEXT: v_mov_b32_e32 v7, s7 +; GCN-NEXT: v_mov_b32_e32 v8, s8 ; GCN-NEXT: v_mov_b32_e32 v9, s9 ; GCN-NEXT: v_mov_b32_e32 v10, s10 ; GCN-NEXT: v_mov_b32_e32 v11, s11 @@ -405,11 +405,11 @@ ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: buffer_load_dword v2, off, s[16:19], 0 offset:4 ; GCN-NEXT: s_brev_b32 s0, 1 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NEXT: s_mov_b32 s3, 0 ; GCN-NEXT: s_mov_b32 s1, s0 ; GCN-NEXT: s_mov_b32 s2, s0 +; GCN-NEXT: s_waitcnt lgkmcnt(0) +; GCN-NEXT: v_mov_b32_e32 v0, s14 ; GCN-NEXT: v_mov_b32_e32 v1, s15 ; GCN-NEXT: s_and_b64 exec, exec, s[12:13] ; GCN-NEXT: image_sample v0, v[0:1], s[4:11], s[0:3] dmask:0x1 diff --git a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/pal-simple-indirect-call.ll @@ -40,6 +40,7 @@ ; GFX9-NEXT: s_mov_b64 s[2:3], s[38:39] ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: s_endpgm +; ; GFX10-LABEL: test_simple_indirect_call: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_getpc_b64 s[36:37] @@ -51,8 +52,8 @@ ; GFX10-NEXT: s_bitset0_b32 s39, 21 ; GFX10-NEXT: s_add_u32 s36, s36, s0 ; GFX10-NEXT: s_addc_u32 s37, s37, 0 -; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_mov_b64 s[2:3], s[38:39] +; GFX10-NEXT: s_mov_b64 s[0:1], s[36:37] ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/saddo.ll b/llvm/test/CodeGen/AMDGPU/saddo.ll --- a/llvm/test/CodeGen/AMDGPU/saddo.ll +++ b/llvm/test/CodeGen/AMDGPU/saddo.ll @@ -294,14 +294,14 @@ ; SI-NEXT: s_mov_b32 s10, -1 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_add_u32 s12, s4, s6 -; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_addc_u32 s13, s5, s7 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: v_cmp_lt_i64_e32 vcc, s[12:13], v[0:1] ; SI-NEXT: v_cmp_lt_i64_e64 s[4:5], s[6:7], 0 -; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: s_mov_b32 s8, s0 ; SI-NEXT: s_mov_b32 s9, s1 +; SI-NEXT: v_mov_b32_e32 v0, s12 ; SI-NEXT: v_mov_b32_e32 v1, s13 ; SI-NEXT: s_xor_b64 s[4:5], s[4:5], vcc ; SI-NEXT: s_mov_b32 s0, s2 @@ -320,9 +320,9 @@ ; VI-NEXT: s_waitcnt lgkmcnt(0) ; VI-NEXT: v_mov_b32_e32 v0, s0 ; VI-NEXT: s_add_u32 s0, s4, s6 -; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v1, s1 ; VI-NEXT: s_addc_u32 s1, s5, s7 +; VI-NEXT: v_mov_b32_e32 v4, s4 ; VI-NEXT: v_mov_b32_e32 v5, s5 ; VI-NEXT: v_mov_b32_e32 v2, s2 ; VI-NEXT: v_mov_b32_e32 v3, s3 diff --git a/llvm/test/CodeGen/AMDGPU/sdiv64.ll b/llvm/test/CodeGen/AMDGPU/sdiv64.ll --- a/llvm/test/CodeGen/AMDGPU/sdiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/sdiv64.ll @@ -180,8 +180,8 @@ ; GCN-IR-NEXT: s_cbranch_vccz BB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s18, s12, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 ; GCN-IR-NEXT: s_addc_u32 s19, s13, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[18:19], v[0:1] ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -212,10 +212,10 @@ ; GCN-IR-NEXT: s_and_b32 s8, s14, 1 ; GCN-IR-NEXT: s_and_b64 s[18:19], s[14:15], s[6:7] ; GCN-IR-NEXT: s_sub_u32 s18, s16, s18 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: s_subb_u32 s19, s17, s19 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1] ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9] @@ -456,12 +456,12 @@ ; GCN-IR-NEXT: v_and_b32_e32 v20, v13, v3 ; GCN-IR-NEXT: v_and_b32_e32 v13, v13, v2 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[11:12] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v16 ; GCN-IR-NEXT: v_sub_i32_e64 v14, s[4:5], v0, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v12, v17 -; GCN-IR-NEXT: v_mov_b32_e32 v17, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v15, s[4:5], v15, v20, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v16 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v17 +; GCN-IR-NEXT: v_mov_b32_e32 v17, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v16, v9 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB1_3 @@ -1041,8 +1041,8 @@ ; GCN-IR-NEXT: s_cbranch_vccz BB9_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s18, s12, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 ; GCN-IR-NEXT: s_addc_u32 s19, s13, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[18:19], v[0:1] ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -1073,10 +1073,10 @@ ; GCN-IR-NEXT: s_and_b32 s8, s14, 1 ; GCN-IR-NEXT: s_and_b64 s[18:19], s[14:15], s[6:7] ; GCN-IR-NEXT: s_sub_u32 s18, s16, s18 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: s_subb_u32 s19, s17, s19 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1] ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9] @@ -1256,8 +1256,8 @@ ; GCN-IR-NEXT: s_cbranch_vccz BB10_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s12, s10, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: s_addc_u32 s13, s11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[12:13], v[0:1] ; GCN-IR-NEXT: s_sub_i32 s9, 63, s10 @@ -1286,10 +1286,10 @@ ; GCN-IR-NEXT: s_and_b32 s6, s12, 1 ; GCN-IR-NEXT: s_and_b64 s[18:19], s[12:13], s[2:3] ; GCN-IR-NEXT: s_sub_u32 s14, s14, s18 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: s_subb_u32 s15, s15, s19 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: s_add_u32 s8, s8, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 ; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[6:7] @@ -1475,8 +1475,8 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_lshr_b64 v[10:11], 24, v10 ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 58, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: BB11_3: ; %udiv-do-while @@ -1496,12 +1496,12 @@ ; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v17 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v16, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 +; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB11_3 @@ -1683,8 +1683,8 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_lshr_b64 v[10:11], s[4:5], v10 ; GCN-IR-NEXT: v_sub_i32_e32 v8, vcc, 47, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v9, vcc, 0, v9, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: BB12_3: ; %udiv-do-while @@ -1704,12 +1704,12 @@ ; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v17 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v16, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 +; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB12_3 @@ -1782,8 +1782,8 @@ ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_lshr_b64 v[9:10], v[7:8], v9 ; GCN-IR-NEXT: v_add_i32_e32 v7, vcc, 0xffffffcf, v0 -; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v8, s[4:5], 0, -1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff @@ -1803,13 +1803,13 @@ ; GCN-IR-NEXT: v_and_b32_e32 v5, 1, v9 ; GCN-IR-NEXT: v_and_b32_e32 v9, 0x8000, v9 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[11:12], v[7:8] -; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v9, s[4:5], v0, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 ; GCN-IR-NEXT: v_subb_u32_e64 v10, s[4:5], v10, v13, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 ; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB13_3 diff --git a/llvm/test/CodeGen/AMDGPU/select64.ll b/llvm/test/CodeGen/AMDGPU/select64.ll --- a/llvm/test/CodeGen/AMDGPU/select64.ll +++ b/llvm/test/CodeGen/AMDGPU/select64.ll @@ -30,8 +30,8 @@ ; VI-NEXT: v_mov_b32_e32 v0, s2 ; VI-NEXT: s_cmp_lt_u32 s4, 6 ; VI-NEXT: s_cselect_b64 s[0:1], s[0:1], 0 -; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v1, s3 +; VI-NEXT: v_mov_b32_e32 v3, s1 ; VI-NEXT: v_mov_b32_e32 v2, s0 ; VI-NEXT: flat_store_dwordx2 v[0:1], v[2:3] ; VI-NEXT: s_endpgm @@ -262,8 +262,8 @@ ; GFX90A-NEXT: v_mov_b32_e32 v2, 0 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 -; GFX90A-NEXT: s_mov_b32 s4, 0 ; GFX90A-NEXT: s_cmp_gt_u32 s6, 5 +; GFX90A-NEXT: s_mov_b32 s4, 0 ; GFX90A-NEXT: s_mov_b32 s5, 63 ; GFX90A-NEXT: s_waitcnt lgkmcnt(0) ; GFX90A-NEXT: s_cselect_b64 s[0:1], s[0:1], s[4:5] diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir b/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir --- a/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-phys-copy.mir @@ -47,8 +47,8 @@ bb.0: liveins: $sgpr0_sgpr1_sgpr2 ; GFX9-LABEL: name: sgpr96_aligned_src_dst - ; GFX9: $sgpr8 = S_MOV_B32 $sgpr2, implicit $sgpr0_sgpr1_sgpr2, implicit-def $sgpr6_sgpr7_sgpr8 - ; GFX9: $sgpr6_sgpr7 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2 + ; GFX9: $sgpr8 = S_MOV_B32 $sgpr2 + ; GFX9: $sgpr6_sgpr7 = S_MOV_B64 $sgpr0_sgpr1 $sgpr6_sgpr7_sgpr8 = COPY $sgpr0_sgpr1_sgpr2 ... @@ -58,9 +58,9 @@ bb.0: liveins: $sgpr0_sgpr1_sgpr2 ; GFX9-LABEL: name: sgpr96_aligned_src - ; GFX9: $sgpr5 = S_MOV_B32 $sgpr2, implicit $sgpr0_sgpr1_sgpr2, implicit-def $sgpr3_sgpr4_sgpr5 - ; GFX9: $sgpr4 = S_MOV_B32 $sgpr1, implicit $sgpr0_sgpr1_sgpr2 - ; GFX9: $sgpr3 = S_MOV_B32 $sgpr0, implicit $sgpr0_sgpr1_sgpr2 + ; GFX9: $sgpr5 = S_MOV_B32 $sgpr2 + ; GFX9: $sgpr4 = S_MOV_B32 $sgpr1 + ; GFX9: $sgpr3 = S_MOV_B32 $sgpr0 $sgpr3_sgpr4_sgpr5 = COPY $sgpr0_sgpr1_sgpr2 ... @@ -70,9 +70,9 @@ bb.0: liveins: $sgpr3_sgpr4_sgpr5 ; GFX9-LABEL: name: sgpr96_aligned_dst - ; GFX9: $sgpr0 = S_MOV_B32 $sgpr3, implicit $sgpr3_sgpr4_sgpr5, implicit-def $sgpr0_sgpr1_sgpr2 - ; GFX9: $sgpr1 = S_MOV_B32 $sgpr4, implicit $sgpr3_sgpr4_sgpr5 - ; GFX9: $sgpr2 = S_MOV_B32 $sgpr5, implicit $sgpr3_sgpr4_sgpr5 + ; GFX9: $sgpr0 = S_MOV_B32 $sgpr3 + ; GFX9: $sgpr1 = S_MOV_B32 $sgpr4 + ; GFX9: $sgpr2 = S_MOV_B32 $sgpr5 $sgpr0_sgpr1_sgpr2 = COPY $sgpr3_sgpr4_sgpr5 ... @@ -82,8 +82,8 @@ bb.0: liveins: $sgpr3_sgpr4_sgpr5 ; GFX9-LABEL: name: sgpr96_unaligned_src_dst - ; GFX9: $sgpr10_sgpr11 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr3_sgpr4_sgpr5, implicit-def $sgpr9_sgpr10_sgpr11 - ; GFX9: $sgpr9 = S_MOV_B32 $sgpr3, implicit $sgpr3_sgpr4_sgpr5 + ; GFX9: $sgpr10_sgpr11 = S_MOV_B64 $sgpr4_sgpr5 + ; GFX9: $sgpr9 = S_MOV_B32 $sgpr3 $sgpr9_sgpr10_sgpr11 = COPY $sgpr3_sgpr4_sgpr5 ... @@ -93,8 +93,8 @@ bb.0: liveins: $sgpr3_sgpr4_sgpr5 ; GFX9-LABEL: name: sgpr96_killed - ; GFX9: $sgpr10_sgpr11 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr3_sgpr4_sgpr5, implicit-def $sgpr9_sgpr10_sgpr11 - ; GFX9: $sgpr9 = S_MOV_B32 $sgpr3, implicit killed $sgpr3_sgpr4_sgpr5 + ; GFX9: $sgpr10_sgpr11 = S_MOV_B64 killed $sgpr4_sgpr5 + ; GFX9: $sgpr9 = S_MOV_B32 killed $sgpr3 $sgpr9_sgpr10_sgpr11 = COPY killed $sgpr3_sgpr4_sgpr5 ... @@ -104,8 +104,8 @@ bb.0: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX9-LABEL: name: sgpr128_forward - ; GFX9: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX9: $sgpr2_sgpr3 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX9: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5 + ; GFX9: $sgpr2_sgpr3 = S_MOV_B64 $sgpr6_sgpr7 $sgpr0_sgpr1_sgpr2_sgpr3 = COPY $sgpr4_sgpr5_sgpr6_sgpr7 ... @@ -115,8 +115,8 @@ bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3 ; GFX9-LABEL: name: sgpr128_backward - ; GFX9: $sgpr6_sgpr7 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX9: $sgpr4_sgpr5 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GFX9: $sgpr6_sgpr7 = S_MOV_B64 $sgpr2_sgpr3 + ; GFX9: $sgpr4_sgpr5 = S_MOV_B64 $sgpr0_sgpr1 $sgpr4_sgpr5_sgpr6_sgpr7 = COPY $sgpr0_sgpr1_sgpr2_sgpr3 ... @@ -126,8 +126,8 @@ bb.0: liveins: $sgpr4_sgpr5_sgpr6_sgpr7 ; GFX9-LABEL: name: sgpr128_killed - ; GFX9: $sgpr0_sgpr1 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3 - ; GFX9: $sgpr2_sgpr3 = S_MOV_B64 $sgpr6_sgpr7, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX9: $sgpr0_sgpr1 = S_MOV_B64 killed $sgpr4_sgpr5 + ; GFX9: $sgpr2_sgpr3 = S_MOV_B64 killed $sgpr6_sgpr7 $sgpr0_sgpr1_sgpr2_sgpr3 = COPY killed $sgpr4_sgpr5_sgpr6_sgpr7 ... @@ -137,9 +137,9 @@ bb.0: liveins: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 ; GFX9-LABEL: name: sgpr160_forward - ; GFX9: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 - ; GFX9: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; GFX9: $sgpr4 = S_MOV_B32 $sgpr12, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; GFX9: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9 + ; GFX9: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11 + ; GFX9: $sgpr4 = S_MOV_B32 $sgpr12 $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 = COPY $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 ... @@ -149,9 +149,9 @@ bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 ; GFX9-LABEL: name: sgpr160_backward - ; GFX9: $sgpr12 = S_MOV_B32 $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; GFX9: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 - ; GFX9: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 + ; GFX9: $sgpr12 = S_MOV_B32 $sgpr4 + ; GFX9: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3 + ; GFX9: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1 $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 ... @@ -161,9 +161,9 @@ bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 ; GFX9-LABEL: name: sgpr160_killed - ; GFX9: $sgpr12 = S_MOV_B32 $sgpr4, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; GFX9: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 - ; GFX9: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 + ; GFX9: $sgpr12 = S_MOV_B32 killed $sgpr4 + ; GFX9: $sgpr10_sgpr11 = S_MOV_B64 killed $sgpr2_sgpr3 + ; GFX9: $sgpr8_sgpr9 = S_MOV_B64 killed $sgpr0_sgpr1 $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4 ... @@ -174,9 +174,9 @@ bb.0: liveins: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; GFX9-LABEL: name: sgpr192_forward - ; GFX9: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX9: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; GFX9: $sgpr4_sgpr5 = S_MOV_B64 $sgpr12_sgpr13, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; GFX9: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9 + ; GFX9: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11 + ; GFX9: $sgpr4_sgpr5 = S_MOV_B64 $sgpr12_sgpr13 $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 = COPY $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ... @@ -186,9 +186,9 @@ bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GFX9-LABEL: name: sgpr192_backward - ; GFX9: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; GFX9: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX9: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX9: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5 + ; GFX9: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3 + ; GFX9: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1 $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ... @@ -198,9 +198,9 @@ bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ; GFX9-LABEL: name: sgpr192_killed - ; GFX9: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; GFX9: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 - ; GFX9: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 + ; GFX9: $sgpr12_sgpr13 = S_MOV_B64 killed $sgpr4_sgpr5 + ; GFX9: $sgpr10_sgpr11 = S_MOV_B64 killed $sgpr2_sgpr3 + ; GFX9: $sgpr8_sgpr9 = S_MOV_B64 killed $sgpr0_sgpr1 $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5 ... @@ -210,10 +210,10 @@ bb.0: liveins: $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX9-LABEL: name: sgpr256_forward - ; GFX9: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX9: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9: $sgpr4_sgpr5 = S_MOV_B64 $sgpr12_sgpr13, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9: $sgpr6_sgpr7 = S_MOV_B64 $sgpr14_sgpr15, implicit $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX9: $sgpr0_sgpr1 = S_MOV_B64 $sgpr8_sgpr9 + ; GFX9: $sgpr2_sgpr3 = S_MOV_B64 $sgpr10_sgpr11 + ; GFX9: $sgpr4_sgpr5 = S_MOV_B64 $sgpr12_sgpr13 + ; GFX9: $sgpr6_sgpr7 = S_MOV_B64 $sgpr14_sgpr15 $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 = COPY $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ... @@ -223,10 +223,10 @@ bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX9-LABEL: name: sgpr256_backward - ; GFX9: $sgpr14_sgpr15 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX9: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX9: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX9: $sgpr14_sgpr15 = S_MOV_B64 $sgpr6_sgpr7 + ; GFX9: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5 + ; GFX9: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3 + ; GFX9: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1 $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ... @@ -236,10 +236,10 @@ bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ; GFX9-LABEL: name: sgpr256_killed - ; GFX9: $sgpr14_sgpr15 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7, implicit-def $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9: $sgpr12_sgpr13 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX9: $sgpr10_sgpr11 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 - ; GFX9: $sgpr8_sgpr9 = S_MOV_B64 $sgpr0_sgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 + ; GFX9: $sgpr14_sgpr15 = S_MOV_B64 killed $sgpr6_sgpr7 + ; GFX9: $sgpr12_sgpr13 = S_MOV_B64 killed $sgpr4_sgpr5 + ; GFX9: $sgpr10_sgpr11 = S_MOV_B64 killed $sgpr2_sgpr3 + ; GFX9: $sgpr8_sgpr9 = S_MOV_B64 killed $sgpr0_sgpr1 $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7 ... @@ -249,14 +249,14 @@ bb.0: liveins: $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 ; GFX9-LABEL: name: sgpr512_forward - ; GFX9: $sgpr0_sgpr1 = S_MOV_B64 $sgpr16_sgpr17, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9: $sgpr2_sgpr3 = S_MOV_B64 $sgpr18_sgpr19, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr4_sgpr5 = S_MOV_B64 $sgpr20_sgpr21, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr6_sgpr7 = S_MOV_B64 $sgpr22_sgpr23, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr8_sgpr9 = S_MOV_B64 $sgpr24_sgpr25, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr10_sgpr11 = S_MOV_B64 $sgpr26_sgpr27, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr12_sgpr13 = S_MOV_B64 $sgpr28_sgpr29, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr14_sgpr15 = S_MOV_B64 $sgpr30_sgpr31, implicit $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + ; GFX9: $sgpr0_sgpr1 = S_MOV_B64 $sgpr16_sgpr17 + ; GFX9: $sgpr2_sgpr3 = S_MOV_B64 $sgpr18_sgpr19 + ; GFX9: $sgpr4_sgpr5 = S_MOV_B64 $sgpr20_sgpr21 + ; GFX9: $sgpr6_sgpr7 = S_MOV_B64 $sgpr22_sgpr23 + ; GFX9: $sgpr8_sgpr9 = S_MOV_B64 $sgpr24_sgpr25 + ; GFX9: $sgpr10_sgpr11 = S_MOV_B64 $sgpr26_sgpr27 + ; GFX9: $sgpr12_sgpr13 = S_MOV_B64 $sgpr28_sgpr29 + ; GFX9: $sgpr14_sgpr15 = S_MOV_B64 $sgpr30_sgpr31 $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 = COPY $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 ... @@ -266,14 +266,14 @@ bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX9-LABEL: name: sgpr512_backward - ; GFX9: $sgpr30_sgpr31 = S_MOV_B64 $sgpr14_sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr28_sgpr29 = S_MOV_B64 $sgpr12_sgpr13, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9: $sgpr26_sgpr27 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9: $sgpr24_sgpr25 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9: $sgpr22_sgpr23 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9: $sgpr20_sgpr21 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9: $sgpr18_sgpr19 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9: $sgpr16_sgpr17 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX9: $sgpr30_sgpr31 = S_MOV_B64 $sgpr14_sgpr15 + ; GFX9: $sgpr28_sgpr29 = S_MOV_B64 $sgpr12_sgpr13 + ; GFX9: $sgpr26_sgpr27 = S_MOV_B64 $sgpr10_sgpr11 + ; GFX9: $sgpr24_sgpr25 = S_MOV_B64 $sgpr8_sgpr9 + ; GFX9: $sgpr22_sgpr23 = S_MOV_B64 $sgpr6_sgpr7 + ; GFX9: $sgpr20_sgpr21 = S_MOV_B64 $sgpr4_sgpr5 + ; GFX9: $sgpr18_sgpr19 = S_MOV_B64 $sgpr2_sgpr3 + ; GFX9: $sgpr16_sgpr17 = S_MOV_B64 $sgpr0_sgpr1 $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ... @@ -283,14 +283,14 @@ bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; GFX9-LABEL: name: sgpr512_killed - ; GFX9: $sgpr30_sgpr31 = S_MOV_B64 $sgpr14_sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit-def $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr28_sgpr29 = S_MOV_B64 $sgpr12_sgpr13, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9: $sgpr26_sgpr27 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9: $sgpr24_sgpr25 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9: $sgpr22_sgpr23 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9: $sgpr20_sgpr21 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9: $sgpr18_sgpr19 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; GFX9: $sgpr16_sgpr17 = S_MOV_B64 $sgpr0_sgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; GFX9: $sgpr30_sgpr31 = S_MOV_B64 killed $sgpr14_sgpr15 + ; GFX9: $sgpr28_sgpr29 = S_MOV_B64 killed $sgpr12_sgpr13 + ; GFX9: $sgpr26_sgpr27 = S_MOV_B64 killed $sgpr10_sgpr11 + ; GFX9: $sgpr24_sgpr25 = S_MOV_B64 killed $sgpr8_sgpr9 + ; GFX9: $sgpr22_sgpr23 = S_MOV_B64 killed $sgpr6_sgpr7 + ; GFX9: $sgpr20_sgpr21 = S_MOV_B64 killed $sgpr4_sgpr5 + ; GFX9: $sgpr18_sgpr19 = S_MOV_B64 killed $sgpr2_sgpr3 + ; GFX9: $sgpr16_sgpr17 = S_MOV_B64 killed $sgpr0_sgpr1 $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ... @@ -300,22 +300,22 @@ bb.0: liveins: $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 ; GFX9-LABEL: name: sgpr1024_forward - ; GFX9: $sgpr0_sgpr1 = S_MOV_B64 $sgpr32_sgpr33, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr2_sgpr3 = S_MOV_B64 $sgpr34_sgpr35, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9: $sgpr4_sgpr5 = S_MOV_B64 $sgpr36_sgpr37, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9: $sgpr6_sgpr7 = S_MOV_B64 $sgpr38_sgpr39, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9: $sgpr8_sgpr9 = S_MOV_B64 $sgpr40_sgpr41, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9: $sgpr10_sgpr11 = S_MOV_B64 $sgpr42_sgpr43, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9: $sgpr12_sgpr13 = S_MOV_B64 $sgpr44_sgpr45, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9: $sgpr14_sgpr15 = S_MOV_B64 $sgpr46_sgpr47, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9: $sgpr16_sgpr17 = S_MOV_B64 $sgpr48_sgpr49, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9: $sgpr18_sgpr19 = S_MOV_B64 $sgpr50_sgpr51, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9: $sgpr20_sgpr21 = S_MOV_B64 $sgpr52_sgpr53, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9: $sgpr22_sgpr23 = S_MOV_B64 $sgpr54_sgpr55, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9: $sgpr24_sgpr25 = S_MOV_B64 $sgpr56_sgpr57, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9: $sgpr26_sgpr27 = S_MOV_B64 $sgpr58_sgpr59, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9: $sgpr28_sgpr29 = S_MOV_B64 $sgpr60_sgpr61, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9: $sgpr30_sgpr31 = S_MOV_B64 $sgpr62_sgpr63, implicit $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 + ; GFX9: $sgpr0_sgpr1 = S_MOV_B64 $sgpr32_sgpr33 + ; GFX9: $sgpr2_sgpr3 = S_MOV_B64 $sgpr34_sgpr35 + ; GFX9: $sgpr4_sgpr5 = S_MOV_B64 $sgpr36_sgpr37 + ; GFX9: $sgpr6_sgpr7 = S_MOV_B64 $sgpr38_sgpr39 + ; GFX9: $sgpr8_sgpr9 = S_MOV_B64 $sgpr40_sgpr41 + ; GFX9: $sgpr10_sgpr11 = S_MOV_B64 $sgpr42_sgpr43 + ; GFX9: $sgpr12_sgpr13 = S_MOV_B64 $sgpr44_sgpr45 + ; GFX9: $sgpr14_sgpr15 = S_MOV_B64 $sgpr46_sgpr47 + ; GFX9: $sgpr16_sgpr17 = S_MOV_B64 $sgpr48_sgpr49 + ; GFX9: $sgpr18_sgpr19 = S_MOV_B64 $sgpr50_sgpr51 + ; GFX9: $sgpr20_sgpr21 = S_MOV_B64 $sgpr52_sgpr53 + ; GFX9: $sgpr22_sgpr23 = S_MOV_B64 $sgpr54_sgpr55 + ; GFX9: $sgpr24_sgpr25 = S_MOV_B64 $sgpr56_sgpr57 + ; GFX9: $sgpr26_sgpr27 = S_MOV_B64 $sgpr58_sgpr59 + ; GFX9: $sgpr28_sgpr29 = S_MOV_B64 $sgpr60_sgpr61 + ; GFX9: $sgpr30_sgpr31 = S_MOV_B64 $sgpr62_sgpr63 $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = COPY $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 ... @@ -325,22 +325,22 @@ bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 ; GFX9-LABEL: name: sgpr1024_backward - ; GFX9: $sgpr62_sgpr63 = S_MOV_B64 $sgpr30_sgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9: $sgpr60_sgpr61 = S_MOV_B64 $sgpr28_sgpr29, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr58_sgpr59 = S_MOV_B64 $sgpr26_sgpr27, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr56_sgpr57 = S_MOV_B64 $sgpr24_sgpr25, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr54_sgpr55 = S_MOV_B64 $sgpr22_sgpr23, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr52_sgpr53 = S_MOV_B64 $sgpr20_sgpr21, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr50_sgpr51 = S_MOV_B64 $sgpr18_sgpr19, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr48_sgpr49 = S_MOV_B64 $sgpr16_sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr46_sgpr47 = S_MOV_B64 $sgpr14_sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr44_sgpr45 = S_MOV_B64 $sgpr12_sgpr13, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr42_sgpr43 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr40_sgpr41 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr38_sgpr39 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr36_sgpr37 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr34_sgpr35 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr32_sgpr33 = S_MOV_B64 $sgpr0_sgpr1, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + ; GFX9: $sgpr62_sgpr63 = S_MOV_B64 $sgpr30_sgpr31 + ; GFX9: $sgpr60_sgpr61 = S_MOV_B64 $sgpr28_sgpr29 + ; GFX9: $sgpr58_sgpr59 = S_MOV_B64 $sgpr26_sgpr27 + ; GFX9: $sgpr56_sgpr57 = S_MOV_B64 $sgpr24_sgpr25 + ; GFX9: $sgpr54_sgpr55 = S_MOV_B64 $sgpr22_sgpr23 + ; GFX9: $sgpr52_sgpr53 = S_MOV_B64 $sgpr20_sgpr21 + ; GFX9: $sgpr50_sgpr51 = S_MOV_B64 $sgpr18_sgpr19 + ; GFX9: $sgpr48_sgpr49 = S_MOV_B64 $sgpr16_sgpr17 + ; GFX9: $sgpr46_sgpr47 = S_MOV_B64 $sgpr14_sgpr15 + ; GFX9: $sgpr44_sgpr45 = S_MOV_B64 $sgpr12_sgpr13 + ; GFX9: $sgpr42_sgpr43 = S_MOV_B64 $sgpr10_sgpr11 + ; GFX9: $sgpr40_sgpr41 = S_MOV_B64 $sgpr8_sgpr9 + ; GFX9: $sgpr38_sgpr39 = S_MOV_B64 $sgpr6_sgpr7 + ; GFX9: $sgpr36_sgpr37 = S_MOV_B64 $sgpr4_sgpr5 + ; GFX9: $sgpr34_sgpr35 = S_MOV_B64 $sgpr2_sgpr3 + ; GFX9: $sgpr32_sgpr33 = S_MOV_B64 $sgpr0_sgpr1 $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 = COPY $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 ... @@ -350,21 +350,21 @@ bb.0: liveins: $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 ; GFX9-LABEL: name: sgpr1024_killed - ; GFX9: $sgpr62_sgpr63 = S_MOV_B64 $sgpr30_sgpr31, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 - ; GFX9: $sgpr60_sgpr61 = S_MOV_B64 $sgpr28_sgpr29, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr58_sgpr59 = S_MOV_B64 $sgpr26_sgpr27, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr56_sgpr57 = S_MOV_B64 $sgpr24_sgpr25, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr54_sgpr55 = S_MOV_B64 $sgpr22_sgpr23, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr52_sgpr53 = S_MOV_B64 $sgpr20_sgpr21, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr50_sgpr51 = S_MOV_B64 $sgpr18_sgpr19, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr48_sgpr49 = S_MOV_B64 $sgpr16_sgpr17, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr46_sgpr47 = S_MOV_B64 $sgpr14_sgpr15, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr44_sgpr45 = S_MOV_B64 $sgpr12_sgpr13, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr42_sgpr43 = S_MOV_B64 $sgpr10_sgpr11, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr40_sgpr41 = S_MOV_B64 $sgpr8_sgpr9, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr38_sgpr39 = S_MOV_B64 $sgpr6_sgpr7, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr36_sgpr37 = S_MOV_B64 $sgpr4_sgpr5, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr34_sgpr35 = S_MOV_B64 $sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 - ; GFX9: $sgpr32_sgpr33 = S_MOV_B64 $sgpr0_sgpr1, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 + ; GFX9: $sgpr62_sgpr63 = S_MOV_B64 killed $sgpr30_sgpr31 + ; GFX9: $sgpr60_sgpr61 = S_MOV_B64 killed $sgpr28_sgpr29 + ; GFX9: $sgpr58_sgpr59 = S_MOV_B64 killed $sgpr26_sgpr27 + ; GFX9: $sgpr56_sgpr57 = S_MOV_B64 killed $sgpr24_sgpr25 + ; GFX9: $sgpr54_sgpr55 = S_MOV_B64 killed $sgpr22_sgpr23 + ; GFX9: $sgpr52_sgpr53 = S_MOV_B64 killed $sgpr20_sgpr21 + ; GFX9: $sgpr50_sgpr51 = S_MOV_B64 killed $sgpr18_sgpr19 + ; GFX9: $sgpr48_sgpr49 = S_MOV_B64 killed $sgpr16_sgpr17 + ; GFX9: $sgpr46_sgpr47 = S_MOV_B64 killed $sgpr14_sgpr15 + ; GFX9: $sgpr44_sgpr45 = S_MOV_B64 killed $sgpr12_sgpr13 + ; GFX9: $sgpr42_sgpr43 = S_MOV_B64 killed $sgpr10_sgpr11 + ; GFX9: $sgpr40_sgpr41 = S_MOV_B64 killed $sgpr8_sgpr9 + ; GFX9: $sgpr38_sgpr39 = S_MOV_B64 killed $sgpr6_sgpr7 + ; GFX9: $sgpr36_sgpr37 = S_MOV_B64 killed $sgpr4_sgpr5 + ; GFX9: $sgpr34_sgpr35 = S_MOV_B64 killed $sgpr2_sgpr3 + ; GFX9: $sgpr32_sgpr33 = S_MOV_B64 killed $sgpr0_sgpr1 $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63 = COPY killed $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 ... diff --git a/llvm/test/CodeGen/AMDGPU/shift-i128.ll b/llvm/test/CodeGen/AMDGPU/shift-i128.ll --- a/llvm/test/CodeGen/AMDGPU/shift-i128.ll +++ b/llvm/test/CodeGen/AMDGPU/shift-i128.ll @@ -262,6 +262,7 @@ ; GCN-LABEL: s_ashr_i128_ss: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx8 s[4:11], s[4:5], 0x0 +; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_ashr_i32 s2, s7, 31 ; GCN-NEXT: s_ashr_i64 s[0:1], s[6:7], s8 @@ -290,7 +291,6 @@ ; GCN-NEXT: v_cndmask_b32_e32 v0, v0, v4, vcc ; GCN-NEXT: v_mov_b32_e32 v6, s4 ; GCN-NEXT: v_mov_b32_e32 v4, 0 -; GCN-NEXT: v_mov_b32_e32 v5, 0 ; GCN-NEXT: v_cndmask_b32_e64 v0, v0, v6, s[0:1] ; GCN-NEXT: flat_store_dwordx4 v[4:5], v[0:3] ; GCN-NEXT: s_endpgm @@ -454,8 +454,8 @@ ; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v10, 16 -; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 @@ -524,8 +524,8 @@ ; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v10, 16 -; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 @@ -594,8 +594,8 @@ ; GCN-NEXT: s_load_dwordx8 s[16:23], s[4:5], 0x8 ; GCN-NEXT: s_load_dwordx8 s[8:15], s[4:5], 0x0 ; GCN-NEXT: v_mov_b32_e32 v10, 16 -; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: v_mov_b32_e32 v11, 0 +; GCN-NEXT: v_mov_b32_e32 v8, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_cmp_lt_u64_e64 s[0:1], s[16:17], 64 ; GCN-NEXT: v_cmp_eq_u64_e64 s[2:3], s[18:19], 0 diff --git a/llvm/test/CodeGen/AMDGPU/shl.ll b/llvm/test/CodeGen/AMDGPU/shl.ll --- a/llvm/test/CodeGen/AMDGPU/shl.ll +++ b/llvm/test/CodeGen/AMDGPU/shl.ll @@ -1117,11 +1117,11 @@ ; SI-NEXT: s_load_dwordx4 s[8:11], s[0:1], 0x9 ; SI-NEXT: s_ashr_i32 s3, s2, 31 ; SI-NEXT: s_lshl_b64 s[0:1], s[2:3], 3 -; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: s_mov_b32 s7, 0xf000 ; SI-NEXT: s_mov_b32 s6, 0 ; SI-NEXT: s_waitcnt lgkmcnt(0) ; SI-NEXT: s_mov_b64 s[4:5], s[10:11] +; SI-NEXT: v_mov_b32_e32 v0, s0 ; SI-NEXT: v_mov_b32_e32 v1, s1 ; SI-NEXT: buffer_load_dword v3, v[0:1], s[4:7], 0 addr64 ; SI-NEXT: s_mov_b64 s[10:11], s[6:7] diff --git a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll --- a/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/sint_to_fp.i64.ll @@ -278,6 +278,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_xor_b32 s7, s2, s3 ; GFX8-NEXT: s_flbit_i32 s6, s3 ; GFX8-NEXT: s_ashr_i32 s7, s7, 31 @@ -303,7 +304,6 @@ ; GFX8-NEXT: s_sub_i32 s0, 32, s2 ; GFX8-NEXT: v_ldexp_f32 v0, v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm %result = sitofp <2 x i64> %in to <2 x float> diff --git a/llvm/test/CodeGen/AMDGPU/srem64.ll b/llvm/test/CodeGen/AMDGPU/srem64.ll --- a/llvm/test/CodeGen/AMDGPU/srem64.ll +++ b/llvm/test/CodeGen/AMDGPU/srem64.ll @@ -153,8 +153,8 @@ ; GCN-IR-NEXT: s_cbranch_vccz BB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s10, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: s_addc_u32 s15, s11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1] ; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 @@ -185,10 +185,10 @@ ; GCN-IR-NEXT: s_and_b32 s2, s12, 1 ; GCN-IR-NEXT: s_and_b64 s[18:19], s[12:13], s[0:1] ; GCN-IR-NEXT: s_sub_u32 s14, s14, s18 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: s_subb_u32 s15, s15, s19 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: s_add_u32 s8, s8, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 ; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] @@ -410,8 +410,8 @@ ; GCN-IR-NEXT: v_lshr_b64 v[14:15], v[0:1], v14 ; GCN-IR-NEXT: v_not_b32_e32 v9, v11 ; GCN-IR-NEXT: v_add_i32_e32 v11, vcc, v3, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v16, 0 ; GCN-IR-NEXT: v_addc_u32_e32 v12, vcc, v9, v13, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v16, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v17, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: BB1_3: ; %udiv-do-while @@ -431,12 +431,12 @@ ; GCN-IR-NEXT: v_and_b32_e32 v20, v13, v6 ; GCN-IR-NEXT: v_and_b32_e32 v13, v13, v5 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[16:17], v[11:12] -; GCN-IR-NEXT: v_mov_b32_e32 v11, v16 ; GCN-IR-NEXT: v_sub_i32_e64 v14, s[4:5], v3, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v12, v17 -; GCN-IR-NEXT: v_mov_b32_e32 v17, v10 ; GCN-IR-NEXT: v_subb_u32_e64 v15, s[4:5], v15, v20, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v11, v16 +; GCN-IR-NEXT: v_mov_b32_e32 v12, v17 +; GCN-IR-NEXT: v_mov_b32_e32 v17, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v16, v9 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB1_3 @@ -1051,8 +1051,8 @@ ; GCN-IR-NEXT: s_cbranch_vccz BB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s16, s12, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 ; GCN-IR-NEXT: s_addc_u32 s17, s13, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -1083,10 +1083,10 @@ ; GCN-IR-NEXT: s_and_b32 s6, s14, 1 ; GCN-IR-NEXT: s_and_b64 s[20:21], s[14:15], s[8:9] ; GCN-IR-NEXT: s_sub_u32 s16, s16, s20 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: s_subb_u32 s17, s17, s21 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1] ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[6:7] @@ -1213,8 +1213,8 @@ ; GCN-IR-NEXT: s_cbranch_vccz BB9_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s16, s12, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 ; GCN-IR-NEXT: s_addc_u32 s17, s13, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s12 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s13 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[16:17], v[0:1] ; GCN-IR-NEXT: s_sub_i32 s12, 63, s12 @@ -1245,10 +1245,10 @@ ; GCN-IR-NEXT: s_and_b32 s8, s14, 1 ; GCN-IR-NEXT: s_and_b64 s[20:21], s[14:15], s[6:7] ; GCN-IR-NEXT: s_sub_u32 s16, s16, s20 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: s_subb_u32 s17, s17, s21 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: s_add_u32 s10, s10, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 ; GCN-IR-NEXT: s_addc_u32 s11, s11, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1] ; GCN-IR-NEXT: s_mov_b64 s[14:15], s[8:9] @@ -1430,8 +1430,8 @@ ; GCN-IR-NEXT: s_cbranch_vccz BB10_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: s_addc_u32 s11, s9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1] ; GCN-IR-NEXT: s_sub_i32 s7, 63, s8 @@ -1460,10 +1460,10 @@ ; GCN-IR-NEXT: s_and_b32 s2, s10, 1 ; GCN-IR-NEXT: s_and_b64 s[16:17], s[10:11], s[4:5] ; GCN-IR-NEXT: s_sub_u32 s12, s12, s16 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s17 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: s_add_u32 s6, s6, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 ; GCN-IR-NEXT: s_addc_u32 s7, s7, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[2:3] @@ -1646,8 +1646,8 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_lshr_b64 v[8:9], 24, v8 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 58, v6 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: BB11_3: ; %udiv-do-while @@ -1667,12 +1667,12 @@ ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v14, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB11_3 @@ -1852,8 +1852,8 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: BB12_3: ; %udiv-do-while @@ -1873,12 +1873,12 @@ ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v14, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB12_3 @@ -1957,8 +1957,8 @@ ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_lshr_b64 v[10:11], v[0:1], v9 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, 0xffffffcf, v8 -; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v9, s[4:5], 0, -1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v13, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff @@ -1978,13 +1978,13 @@ ; GCN-IR-NEXT: v_or_b32_e32 v5, v13, v5 ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[12:13], v[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 ; GCN-IR-NEXT: v_mov_b32_e32 v14, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v10, s[4:5], v10, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 ; GCN-IR-NEXT: v_subb_u32_e64 v11, s[4:5], v11, v14, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v8, v12 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v13 +; GCN-IR-NEXT: v_mov_b32_e32 v13, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v12, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB13_3 diff --git a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll --- a/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-pointer-offset-relative-frameindex.ll @@ -14,11 +14,12 @@ ; MUBUF-NEXT: s_mov_b32 s38, -1 ; MUBUF-NEXT: s_mov_b32 s39, 0x31c16000 ; MUBUF-NEXT: s_add_u32 s36, s36, s11 -; MUBUF-NEXT: s_addc_u32 s37, s37, 0 ; MUBUF-NEXT: v_mov_b32_e32 v1, 0x2000 ; MUBUF-NEXT: v_mov_b32_e32 v2, 0x4000 ; MUBUF-NEXT: v_mov_b32_e32 v3, 0 ; MUBUF-NEXT: v_mov_b32_e32 v4, 0x400000 +; MUBUF-NEXT: s_addc_u32 s37, s37, 0 +; MUBUF-NEXT: s_mov_b64 s[2:3], s[38:39] ; MUBUF-NEXT: s_mov_b32 s32, 0xc0000 ; MUBUF-NEXT: v_add_nc_u32_e64 v40, 4, 0x4000 ; MUBUF-NEXT: s_getpc_b64 s[4:5] @@ -27,7 +28,6 @@ ; MUBUF-NEXT: s_waitcnt lgkmcnt(0) ; MUBUF-NEXT: v_mov_b32_e32 v0, s0 ; MUBUF-NEXT: s_mov_b64 s[0:1], s[36:37] -; MUBUF-NEXT: s_mov_b64 s[2:3], s[38:39] ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 ; MUBUF-NEXT: s_and_saveexec_b32 s0, vcc_lo diff --git a/llvm/test/CodeGen/AMDGPU/store-local.128.ll b/llvm/test/CodeGen/AMDGPU/store-local.128.ll --- a/llvm/test/CodeGen/AMDGPU/store-local.128.ll +++ b/llvm/test/CodeGen/AMDGPU/store-local.128.ll @@ -436,8 +436,8 @@ ; GFX9-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-NEXT: v_mov_b32_e32 v4, s2 ; GFX9-NEXT: v_mov_b32_e32 v0, s4 -; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: v_mov_b32_e32 v2, s6 ; GFX9-NEXT: v_mov_b32_e32 v3, s7 ; GFX9-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX9-NEXT: s_endpgm @@ -450,8 +450,8 @@ ; GFX7-NEXT: s_waitcnt lgkmcnt(0) ; GFX7-NEXT: v_mov_b32_e32 v4, s4 ; GFX7-NEXT: v_mov_b32_e32 v0, s0 -; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 ; GFX7-NEXT: v_mov_b32_e32 v3, s3 ; GFX7-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX7-NEXT: s_endpgm @@ -478,8 +478,8 @@ ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: v_mov_b32_e32 v4, s2 ; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v2, s6 ; GFX10-NEXT: v_mov_b32_e32 v3, s7 ; GFX10-NEXT: ds_write2_b64 v4, v[0:1], v[2:3] offset1:1 ; GFX10-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll --- a/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll +++ b/llvm/test/CodeGen/AMDGPU/store-weird-sizes.ll @@ -196,11 +196,11 @@ ; HAWAII-NEXT: s_mov_b32 m0, -1 ; HAWAII-NEXT: s_waitcnt lgkmcnt(0) ; HAWAII-NEXT: v_mov_b32_e32 v2, s2 +; HAWAII-NEXT: v_mov_b32_e32 v1, s1 ; HAWAII-NEXT: s_and_b32 s3, s3, 1 ; HAWAII-NEXT: v_mov_b32_e32 v0, s3 ; HAWAII-NEXT: ds_write_b8 v2, v0 offset:8 ; HAWAII-NEXT: v_mov_b32_e32 v0, s0 -; HAWAII-NEXT: v_mov_b32_e32 v1, s1 ; HAWAII-NEXT: ds_write_b64 v2, v[0:1] ; HAWAII-NEXT: s_endpgm ; @@ -212,11 +212,11 @@ ; FIJI-NEXT: s_mov_b32 m0, -1 ; FIJI-NEXT: s_waitcnt lgkmcnt(0) ; FIJI-NEXT: v_mov_b32_e32 v2, s2 +; FIJI-NEXT: v_mov_b32_e32 v1, s1 ; FIJI-NEXT: s_and_b32 s3, s3, 1 ; FIJI-NEXT: v_mov_b32_e32 v0, s3 ; FIJI-NEXT: ds_write_b8 v2, v0 offset:8 ; FIJI-NEXT: v_mov_b32_e32 v0, s0 -; FIJI-NEXT: v_mov_b32_e32 v1, s1 ; FIJI-NEXT: ds_write_b64 v2, v[0:1] ; FIJI-NEXT: s_endpgm ; diff --git a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll --- a/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll +++ b/llvm/test/CodeGen/AMDGPU/subreg-coalescer-undef-use.ll @@ -10,18 +10,15 @@ ; CHECK-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x9 ; CHECK-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0xb ; CHECK-NEXT: v_mbcnt_lo_u32_b32_e64 v0, -1, 0 -; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_mov_b32 s2, -1 +; CHECK-NEXT: v_cmp_eq_u32_e32 vcc, 0, v0 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: v_mov_b32_e32 v1, s5 -; CHECK-NEXT: v_mov_b32_e32 v2, s6 -; CHECK-NEXT: v_mov_b32_e32 v3, s7 ; CHECK-NEXT: s_and_saveexec_b64 s[6:7], vcc ; CHECK-NEXT: ; %bb.1: ; %ift ; CHECK-NEXT: s_mov_b32 s4, s5 ; CHECK-NEXT: v_mov_b32_e32 v0, s4 -; CHECK-NEXT: v_mov_b32_e32 v1, s5 ; CHECK-NEXT: v_mov_b32_e32 v2, s6 ; CHECK-NEXT: v_mov_b32_e32 v3, s7 ; CHECK-NEXT: ; %bb.2: ; %ife diff --git a/llvm/test/CodeGen/AMDGPU/trap-abis.ll b/llvm/test/CodeGen/AMDGPU/trap-abis.ll --- a/llvm/test/CodeGen/AMDGPU/trap-abis.ll +++ b/llvm/test/CodeGen/AMDGPU/trap-abis.ll @@ -489,8 +489,8 @@ ; HSA-TRAP-GFX803-V2-NEXT: s_and_b64 vcc, exec, vcc ; HSA-TRAP-GFX803-V2-NEXT: s_cbranch_vccz BB1_2 ; HSA-TRAP-GFX803-V2-NEXT: ; %bb.1: ; %ret -; HSA-TRAP-GFX803-V2-NEXT: v_mov_b32_e32 v0, s0 ; HSA-TRAP-GFX803-V2-NEXT: v_mov_b32_e32 v2, 3 +; HSA-TRAP-GFX803-V2-NEXT: v_mov_b32_e32 v0, s0 ; HSA-TRAP-GFX803-V2-NEXT: v_mov_b32_e32 v1, s1 ; HSA-TRAP-GFX803-V2-NEXT: flat_store_dword v[0:1], v2 ; HSA-TRAP-GFX803-V2-NEXT: s_waitcnt vmcnt(0) @@ -511,8 +511,8 @@ ; HSA-TRAP-GFX803-V3-NEXT: s_and_b64 vcc, exec, vcc ; HSA-TRAP-GFX803-V3-NEXT: s_cbranch_vccz BB1_2 ; HSA-TRAP-GFX803-V3-NEXT: ; %bb.1: ; %ret -; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v0, s0 ; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v2, 3 +; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v0, s0 ; HSA-TRAP-GFX803-V3-NEXT: v_mov_b32_e32 v1, s1 ; HSA-TRAP-GFX803-V3-NEXT: flat_store_dword v[0:1], v2 ; HSA-TRAP-GFX803-V3-NEXT: s_waitcnt vmcnt(0) @@ -533,8 +533,8 @@ ; HSA-TRAP-GFX803-V4-NEXT: s_and_b64 vcc, exec, vcc ; HSA-TRAP-GFX803-V4-NEXT: s_cbranch_vccz BB1_2 ; HSA-TRAP-GFX803-V4-NEXT: ; %bb.1: ; %ret -; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v0, s0 ; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v2, 3 +; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v0, s0 ; HSA-TRAP-GFX803-V4-NEXT: v_mov_b32_e32 v1, s1 ; HSA-TRAP-GFX803-V4-NEXT: flat_store_dword v[0:1], v2 ; HSA-TRAP-GFX803-V4-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/udiv64.ll b/llvm/test/CodeGen/AMDGPU/udiv64.ll --- a/llvm/test/CodeGen/AMDGPU/udiv64.ll +++ b/llvm/test/CodeGen/AMDGPU/udiv64.ll @@ -154,8 +154,8 @@ ; GCN-IR-NEXT: s_cbranch_vccz BB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s8, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: s_addc_u32 s15, s9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1] ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -186,10 +186,10 @@ ; GCN-IR-NEXT: s_and_b32 s2, s10, 1 ; GCN-IR-NEXT: s_and_b64 s[14:15], s[10:11], s[0:1] ; GCN-IR-NEXT: s_sub_u32 s14, s12, s14 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: s_subb_u32 s15, s13, s15 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: s_add_u32 s6, s6, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 ; GCN-IR-NEXT: s_addc_u32 s7, s7, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[2:3] @@ -376,8 +376,8 @@ ; GCN-IR-NEXT: v_not_b32_e32 v0, v8 ; GCN-IR-NEXT: v_not_b32_e32 v1, v9 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, v0, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 ; GCN-IR-NEXT: v_addc_u32_e32 v1, vcc, v1, v11, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v8, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v7, 0 ; GCN-IR-NEXT: BB1_3: ; %udiv-do-while @@ -397,12 +397,12 @@ ; GCN-IR-NEXT: v_or_b32_e32 v5, v9, v5 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[8:9], v[0:1] -; GCN-IR-NEXT: v_mov_b32_e32 v0, v8 ; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v10, v12 -; GCN-IR-NEXT: v_mov_b32_e32 v1, v9 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v7 ; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v11, v13, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v0, v8 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v8, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB1_3 @@ -851,8 +851,8 @@ ; GCN-IR-NEXT: s_cbranch_vccz BB7_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s8, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: s_addc_u32 s15, s9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1] ; GCN-IR-NEXT: s_sub_i32 s8, 63, s8 @@ -883,10 +883,10 @@ ; GCN-IR-NEXT: s_and_b32 s0, s10, 1 ; GCN-IR-NEXT: s_and_b64 s[14:15], s[10:11], s[2:3] ; GCN-IR-NEXT: s_sub_u32 s14, s12, s14 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: s_subb_u32 s15, s13, s15 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: s_add_u32 s6, s6, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 ; GCN-IR-NEXT: s_addc_u32 s7, s7, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[0:1] @@ -1047,8 +1047,8 @@ ; GCN-IR-NEXT: s_cbranch_vccz BB8_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: s_addc_u32 s11, s9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1] ; GCN-IR-NEXT: s_sub_i32 s7, 63, s8 @@ -1077,10 +1077,10 @@ ; GCN-IR-NEXT: s_and_b32 s4, s10, 1 ; GCN-IR-NEXT: s_and_b64 s[16:17], s[10:11], s[2:3] ; GCN-IR-NEXT: s_sub_u32 s12, s12, s16 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s17 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: s_add_u32 s6, s6, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 ; GCN-IR-NEXT: s_addc_u32 s7, s7, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -1249,8 +1249,8 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: BB9_3: ; %udiv-do-while @@ -1270,12 +1270,12 @@ ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v14, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB9_3 @@ -1337,8 +1337,8 @@ ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_lshr_b64 v[7:8], v[0:1], v7 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffcf, v6 -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff @@ -1358,13 +1358,13 @@ ; GCN-IR-NEXT: v_or_b32_e32 v3, v10, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v10, vcc, 0, v1, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[0:1] -; GCN-IR-NEXT: v_mov_b32_e32 v0, v9 ; GCN-IR-NEXT: v_mov_b32_e32 v6, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], v7, v11 -; GCN-IR-NEXT: v_mov_b32_e32 v1, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v10, v5 ; GCN-IR-NEXT: v_subb_u32_e64 v8, s[4:5], v8, v6, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v0, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB10_3 @@ -1514,8 +1514,8 @@ ; GCN-IR-NEXT: s_cbranch_vccz BB11_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s10, s6, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: s_addc_u32 s11, s7, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1] ; GCN-IR-NEXT: s_sub_i32 s6, 63, s6 @@ -1541,10 +1541,10 @@ ; GCN-IR-NEXT: s_and_b32 s4, s8, 1 ; GCN-IR-NEXT: s_and_b32 s8, s8, 24 ; GCN-IR-NEXT: s_sub_u32 s10, s10, s8 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 ; GCN-IR-NEXT: s_subb_u32 s11, s11, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s2 ; GCN-IR-NEXT: s_add_u32 s2, s2, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s3 ; GCN-IR-NEXT: s_addc_u32 s3, s3, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[0:1] ; GCN-IR-NEXT: s_mov_b64 s[8:9], s[4:5] @@ -1707,8 +1707,8 @@ ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_lshr_b64 v[7:8], v[0:1], v7 ; GCN-IR-NEXT: v_add_i32_e32 v0, vcc, 0xffffffc4, v6 -; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v1, s[4:5], 0, -1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v9, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: BB12_3: ; %udiv-do-while @@ -1727,12 +1727,12 @@ ; GCN-IR-NEXT: v_and_b32_e32 v4, 1, v7 ; GCN-IR-NEXT: v_and_b32_e32 v7, 24, v7 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[9:10], v[0:1] -; GCN-IR-NEXT: v_mov_b32_e32 v0, v9 ; GCN-IR-NEXT: v_sub_i32_e64 v7, s[4:5], v6, v7 -; GCN-IR-NEXT: v_mov_b32_e32 v1, v10 -; GCN-IR-NEXT: v_mov_b32_e32 v10, v5 ; GCN-IR-NEXT: v_subbrev_u32_e64 v8, s[4:5], 0, v8, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v0, v9 +; GCN-IR-NEXT: v_mov_b32_e32 v1, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v10, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v9, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB12_3 diff --git a/llvm/test/CodeGen/AMDGPU/udivrem.ll b/llvm/test/CodeGen/AMDGPU/udivrem.ll --- a/llvm/test/CodeGen/AMDGPU/udivrem.ll +++ b/llvm/test/CodeGen/AMDGPU/udivrem.ll @@ -102,8 +102,8 @@ ; GFX8-NEXT: v_cndmask_b32_e64 v2, v2, v4, s[0:1] ; GFX8-NEXT: v_subrev_u32_e32 v4, vcc, s6, v3 ; GFX8-NEXT: flat_store_dword v[0:1], v2 -; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_cndmask_b32_e64 v2, v3, v4, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s4 ; GFX8-NEXT: v_mov_b32_e32 v1, s5 ; GFX8-NEXT: flat_store_dword v[0:1], v2 ; GFX8-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll --- a/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll +++ b/llvm/test/CodeGen/AMDGPU/uint_to_fp.i64.ll @@ -238,6 +238,7 @@ ; GFX8-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX8-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x34 ; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: s_flbit_i32_b32 s6, s3 ; GFX8-NEXT: s_flbit_i32_b32 s7, s1 ; GFX8-NEXT: s_min_u32 s6, s6, 32 @@ -255,7 +256,6 @@ ; GFX8-NEXT: s_sub_i32 s0, 32, s7 ; GFX8-NEXT: v_ldexp_f32 v0, v2, s0 ; GFX8-NEXT: v_mov_b32_e32 v2, s4 -; GFX8-NEXT: v_mov_b32_e32 v3, s5 ; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] ; GFX8-NEXT: s_endpgm %result = uitofp <2 x i64> %in to <2 x float> diff --git a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll --- a/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll +++ b/llvm/test/CodeGen/AMDGPU/undefined-subreg-liverange.ll @@ -100,8 +100,6 @@ ; CHECK-NEXT: ;;#ASMEND ; CHECK-NEXT: v_mov_b32_e32 v0, v5 ; CHECK-NEXT: v_mov_b32_e32 v1, v6 -; CHECK-NEXT: v_mov_b32_e32 v2, v7 -; CHECK-NEXT: v_mov_b32_e32 v3, v8 ; CHECK-NEXT: s_mov_b32 s3, 0xf000 ; CHECK-NEXT: s_mov_b32 s2, -1 ; CHECK-NEXT: v_mov_b32_e32 v0, v6 diff --git a/llvm/test/CodeGen/AMDGPU/urem64.ll b/llvm/test/CodeGen/AMDGPU/urem64.ll --- a/llvm/test/CodeGen/AMDGPU/urem64.ll +++ b/llvm/test/CodeGen/AMDGPU/urem64.ll @@ -153,8 +153,8 @@ ; GCN-IR-NEXT: s_cbranch_vccz BB0_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s14, s10, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: s_addc_u32 s15, s11, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s10 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s11 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[14:15], v[0:1] ; GCN-IR-NEXT: s_sub_i32 s10, 63, s10 @@ -185,10 +185,10 @@ ; GCN-IR-NEXT: s_and_b32 s2, s12, 1 ; GCN-IR-NEXT: s_and_b64 s[18:19], s[12:13], s[0:1] ; GCN-IR-NEXT: s_sub_u32 s14, s14, s18 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: s_subb_u32 s15, s15, s19 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: s_add_u32 s8, s8, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 ; GCN-IR-NEXT: s_addc_u32 s9, s9, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[0:1] ; GCN-IR-NEXT: s_mov_b64 s[12:13], s[2:3] @@ -382,9 +382,9 @@ ; GCN-IR-NEXT: v_add_i32_e32 v14, vcc, -1, v2 ; GCN-IR-NEXT: v_addc_u32_e32 v15, vcc, -1, v3, vcc ; GCN-IR-NEXT: v_not_b32_e32 v6, v8 +; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12 ; GCN-IR-NEXT: v_not_b32_e32 v7, v9 ; GCN-IR-NEXT: v_add_i32_e32 v8, vcc, v6, v10 -; GCN-IR-NEXT: v_lshr_b64 v[12:13], v[0:1], v12 ; GCN-IR-NEXT: v_addc_u32_e32 v9, vcc, v7, v11, vcc ; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 @@ -406,12 +406,12 @@ ; GCN-IR-NEXT: v_or_b32_e32 v5, v11, v5 ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v9, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[8:9] -; GCN-IR-NEXT: v_mov_b32_e32 v8, v10 ; GCN-IR-NEXT: v_sub_i32_e64 v12, s[4:5], v12, v17 -; GCN-IR-NEXT: v_mov_b32_e32 v9, v11 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 ; GCN-IR-NEXT: v_subb_u32_e64 v13, s[4:5], v13, v16, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v8, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v9, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v7 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v6 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB1_3 @@ -867,8 +867,8 @@ ; GCN-IR-NEXT: s_cbranch_vccz BB6_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: s_addc_u32 s11, s9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1] ; GCN-IR-NEXT: s_sub_i32 s7, 63, s8 @@ -897,10 +897,10 @@ ; GCN-IR-NEXT: s_and_b32 s4, s10, 1 ; GCN-IR-NEXT: s_and_b64 s[16:17], s[10:11], s[2:3] ; GCN-IR-NEXT: s_sub_u32 s12, s12, s16 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: s_subb_u32 s13, s13, s17 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: s_add_u32 s6, s6, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 ; GCN-IR-NEXT: s_addc_u32 s7, s7, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -1063,8 +1063,8 @@ ; GCN-IR-NEXT: s_cbranch_vccz BB7_5 ; GCN-IR-NEXT: ; %bb.1: ; %udiv-bb1 ; GCN-IR-NEXT: s_add_u32 s10, s8, 1 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: s_addc_u32 s11, s9, 0 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s8 ; GCN-IR-NEXT: v_mov_b32_e32 v1, s9 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[10:11], v[0:1] ; GCN-IR-NEXT: s_sub_i32 s7, 63, s8 @@ -1090,10 +1090,10 @@ ; GCN-IR-NEXT: s_and_b32 s4, s10, 1 ; GCN-IR-NEXT: s_and_b32 s10, s10, 24 ; GCN-IR-NEXT: s_sub_u32 s12, s12, s10 -; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: s_subb_u32 s13, s13, 0 -; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 +; GCN-IR-NEXT: v_mov_b32_e32 v0, s6 ; GCN-IR-NEXT: s_add_u32 s6, s6, 1 +; GCN-IR-NEXT: v_mov_b32_e32 v1, s7 ; GCN-IR-NEXT: s_addc_u32 s7, s7, 0 ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[0:1] ; GCN-IR-NEXT: s_mov_b64 s[10:11], s[4:5] @@ -1273,8 +1273,8 @@ ; GCN-IR-NEXT: v_addc_u32_e32 v13, vcc, -1, v1, vcc ; GCN-IR-NEXT: v_lshr_b64 v[8:9], s[4:5], v8 ; GCN-IR-NEXT: v_sub_i32_e32 v6, vcc, 47, v6 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_subb_u32_e32 v7, vcc, 0, v7, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: BB8_3: ; %udiv-do-while @@ -1294,12 +1294,12 @@ ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v15 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v14, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB8_3 @@ -1367,8 +1367,8 @@ ; GCN-IR-NEXT: ; %bb.2: ; %udiv-preheader ; GCN-IR-NEXT: v_lshr_b64 v[8:9], v[0:1], v7 ; GCN-IR-NEXT: v_add_i32_e32 v6, vcc, 0xffffffcf, v6 -; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_addc_u32_e64 v7, s[4:5], 0, -1, vcc +; GCN-IR-NEXT: v_mov_b32_e32 v10, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v11, 0 ; GCN-IR-NEXT: v_mov_b32_e32 v5, 0 ; GCN-IR-NEXT: s_movk_i32 s12, 0x7fff @@ -1388,13 +1388,13 @@ ; GCN-IR-NEXT: v_or_b32_e32 v3, v11, v3 ; GCN-IR-NEXT: v_addc_u32_e32 v11, vcc, 0, v7, vcc ; GCN-IR-NEXT: v_cmp_lt_u64_e32 vcc, v[10:11], v[6:7] -; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 ; GCN-IR-NEXT: v_mov_b32_e32 v12, 0 ; GCN-IR-NEXT: v_sub_i32_e64 v8, s[4:5], v8, v13 -; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 -; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: v_subb_u32_e64 v9, s[4:5], v9, v12, s[4:5] ; GCN-IR-NEXT: s_or_b64 s[10:11], vcc, s[10:11] +; GCN-IR-NEXT: v_mov_b32_e32 v6, v10 +; GCN-IR-NEXT: v_mov_b32_e32 v7, v11 +; GCN-IR-NEXT: v_mov_b32_e32 v11, v5 ; GCN-IR-NEXT: v_mov_b32_e32 v10, v4 ; GCN-IR-NEXT: s_andn2_b64 exec, exec, s[10:11] ; GCN-IR-NEXT: s_cbranch_execnz BB9_3 diff --git a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expand_and_shrink.mir b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expand_and_shrink.mir --- a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expand_and_shrink.mir +++ b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expand_and_shrink.mir @@ -2,8 +2,8 @@ --- # GCN-LABEL: name: expand_imm64_sext_shrink_to_bfrev -# GCN: $vgpr0 = V_MOV_B32_e32 0, implicit $exec, implicit-def $vgpr0_vgpr1 -# GCN: $vgpr1 = V_BFREV_B32_e32 1, implicit $exec, implicit-def $vgpr0_vgpr1 +# GCN: $vgpr0 = V_MOV_B32_e32 0, implicit $exec +# GCN: $vgpr1 = V_BFREV_B32_e32 1, implicit $exec name: expand_imm64_sext_shrink_to_bfrev tracksRegLiveness: true body: | diff --git a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir --- a/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir +++ b/llvm/test/CodeGen/AMDGPU/v_mov_b64_expansion.mir @@ -2,8 +2,8 @@ # RUN: llc -march=amdgcn -mcpu=gfx90a -run-pass postrapseudos -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GCN,GFX90A %s # GCN-LABEL: name: v_mov_b64_from_vgpr -# GFX900: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec, implicit-def $vgpr0_vgpr1 -# GFX900: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec, implicit-def $vgpr0_vgpr1 +# GFX900: $vgpr0 = V_MOV_B32_e32 $vgpr2, implicit $exec +# GFX900: $vgpr1 = V_MOV_B32_e32 $vgpr3, implicit $exec # GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $vgpr2_vgpr3, 12, $vgpr2_vgpr3, 0, 0, 0, 0, 0, implicit $exec name: v_mov_b64_from_vgpr body: | @@ -12,8 +12,8 @@ ... # GCN-LABEL: name: v_mov_b64_from_sgpr -# GFX900: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec, implicit-def $vgpr0_vgpr1 -# GFX900: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec, implicit-def $vgpr0_vgpr1 +# GFX900: $vgpr0 = V_MOV_B32_e32 $sgpr2, implicit $exec +# GFX900: $vgpr1 = V_MOV_B32_e32 $sgpr3, implicit $exec # GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, $sgpr2_sgpr3, 12, $sgpr2_sgpr3, 0, 0, 0, 0, 0, implicit $exec name: v_mov_b64_from_sgpr body: | @@ -22,10 +22,10 @@ ... # GCN-LABEL: name: v_mov_b64_from_sext_inline_imm -# GFX900: $vgpr0 = V_MOV_B32_e32 -2, implicit $exec, implicit-def $vgpr0_vgpr1 -# GFX900: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr0_vgpr1 -# GFX90A: $vgpr0 = V_MOV_B32_e32 -2, implicit $exec, implicit-def $vgpr0_vgpr1 -# GFX90A: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr0_vgpr1 +# GFX900: $vgpr0 = V_MOV_B32_e32 -2, implicit $exec +# GFX900: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec +# GFX90A: $vgpr0 = V_MOV_B32_e32 -2, implicit $exec +# GFX90A: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec name: v_mov_b64_from_sext_inline_imm body: | bb.0: @@ -33,8 +33,8 @@ ... # GCN-LABEL: name: v_mov_b64_from_lit -# GCN: $vgpr0 = V_MOV_B32_e32 1430494974, implicit $exec, implicit-def $vgpr0_vgpr1 -# GCN: $vgpr1 = V_MOV_B32_e32 -232831, implicit $exec, implicit-def $vgpr0_vgpr1 +# GCN: $vgpr0 = V_MOV_B32_e32 1430494974, implicit $exec +# GCN: $vgpr1 = V_MOV_B32_e32 -232831, implicit $exec name: v_mov_b64_from_lit body: | bb.0: @@ -42,8 +42,8 @@ ... # GCN-LABEL: name: v_mov_b64_from_first_inline_imm -# GCN: $vgpr0 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr0_vgpr1 -# GCN: $vgpr1 = V_MOV_B32_e32 268435455, implicit $exec, implicit-def $vgpr0_vgpr1 +# GCN: $vgpr0 = V_MOV_B32_e32 -1, implicit $exec +# GCN: $vgpr1 = V_MOV_B32_e32 268435455, implicit $exec name: v_mov_b64_from_first_inline_imm body: | bb.0: @@ -51,8 +51,8 @@ ... # GCN-LABEL: name: v_mov_b64_from_second_inline_imm -# GCN: $vgpr0 = V_MOV_B32_e32 268435455, implicit $exec, implicit-def $vgpr0_vgpr1 -# GCN: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr0_vgpr1 +# GCN: $vgpr0 = V_MOV_B32_e32 268435455, implicit $exec +# GCN: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec name: v_mov_b64_from_second_inline_imm body: | bb.0: @@ -60,8 +60,8 @@ ... # GCN-LABEL: name: v_mov_b64_from_same_sext_inline_imm -# GFX900: $vgpr0 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr0_vgpr1 -# GFX900: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec, implicit-def $vgpr0_vgpr1 +# GFX900: $vgpr0 = V_MOV_B32_e32 -1, implicit $exec +# GFX900: $vgpr1 = V_MOV_B32_e32 -1, implicit $exec # GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, -1, 8, -1, 0, 0, 0, 0, 0, implicit $exec name: v_mov_b64_from_same_sext_inline_imm body: | @@ -70,8 +70,8 @@ ... # GCN-LABEL: name: v_mov_b64_from_same_fp_inline_imm -# GFX900: $vgpr0 = V_MOV_B32_e32 1065353216, implicit $exec, implicit-def $vgpr0_vgpr1 -# GFX900: $vgpr1 = V_MOV_B32_e32 1065353216, implicit $exec, implicit-def $vgpr0_vgpr1 +# GFX900: $vgpr0 = V_MOV_B32_e32 1065353216, implicit $exec +# GFX900: $vgpr1 = V_MOV_B32_e32 1065353216, implicit $exec # GFX90A: $vgpr0_vgpr1 = V_PK_MOV_B32 8, 1065353216, 8, 1065353216, 0, 0, 0, 0, 0, implicit $exec name: v_mov_b64_from_same_fp_inline_imm body: | diff --git a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll --- a/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll +++ b/llvm/test/CodeGen/AMDGPU/widen-smrd-loads.ll @@ -282,10 +282,10 @@ ; VI-NEXT: v_add_u32_e32 v0, vcc, s0, v0 ; VI-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc ; VI-NEXT: flat_load_ushort v0, v[0:1] +; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: s_waitcnt vmcnt(0) ; VI-NEXT: v_add_u16_e32 v2, 0x3e7, v0 ; VI-NEXT: v_mov_b32_e32 v0, 0 -; VI-NEXT: v_mov_b32_e32 v1, 0 ; VI-NEXT: v_or_b32_e32 v2, 4, v2 ; VI-NEXT: flat_store_short v[0:1], v2 ; VI-NEXT: s_endpgm @@ -386,8 +386,8 @@ ; SI-NEXT: s_and_b32 s1, s1, 1 ; SI-NEXT: s_add_u32 s4, s1, 0x3e7 ; SI-NEXT: s_addc_u32 s5, 0, 0 -; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: s_mov_b32 s1, s0 +; SI-NEXT: v_mov_b32_e32 v0, s4 ; SI-NEXT: v_mov_b32_e32 v1, s5 ; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; SI-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -808,8 +808,8 @@ ; GFX9-O3-NEXT: v_mov_b32_e32 v6, s9 ; GFX9-O3-NEXT: s_not_b64 exec, exec ; GFX9-O3-NEXT: v_mov_b32_e32 v7, v1 -; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-O3-NEXT: v_mov_b32_e32 v9, v3 ; GFX9-O3-NEXT: v_mov_b32_e32 v10, v4 ; GFX9-O3-NEXT: v_mov_b32_e32 v11, v5 ; GFX9-O3-NEXT: v_mov_b32_e32 v12, v6