diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -317,6 +317,9 @@ void initializeSIPostRABundlerPass(PassRegistry&); extern char &SIPostRABundlerID; +void initializeGCNCreateVOPDPass(PassRegistry &); +extern char &GCNCreateVOPDID; + void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&); extern char &AMDGPUUnifyDivergentExitNodesID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -22,11 +22,13 @@ #include "AMDGPUTargetTransformInfo.h" #include "GCNIterativeScheduler.h" #include "GCNSchedStrategy.h" +#include "GCNVOPDUtils.h" #include "R600.h" #include "R600TargetMachine.h" #include "SIMachineFunctionInfo.h" #include "SIMachineScheduler.h" #include "TargetInfo/AMDGPUTargetInfo.h" +#include "Utils/AMDGPUBaseInfo.h" #include "llvm/Analysis/CGSCCPassManager.h" #include "llvm/CodeGen/GlobalISel/CSEInfo.h" #include "llvm/CodeGen/GlobalISel/IRTranslator.h" @@ -278,6 +280,12 @@ cl::desc("Enable s_delay_alu insertion"), cl::init(true), cl::Hidden); +// Enable GFX11+ VOPD +static cl::opt + EnableVOPD("amdgpu-enable-vopd", + cl::desc("Enable VOPD, dual issue of VALU in wave32"), + cl::init(true), cl::Hidden); + // Option is used in lit tests to prevent deadcoding of patterns inspected. static cl::opt EnableDCEInRA("amdgpu-dce-in-ra", @@ -383,6 +391,7 @@ initializeSIPreAllocateWWMRegsPass(*PR); initializeSIFormMemoryClausesPass(*PR); initializeSIPostRABundlerPass(*PR); + initializeGCNCreateVOPDPass(*PR); initializeAMDGPUUnifyDivergentExitNodesPass(*PR); initializeAMDGPUAAWrapperPassPass(*PR); initializeAMDGPUExternalAAWrapperPass(*PR); @@ -920,6 +929,8 @@ DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII)); DAG->addMutation(createIGroupLPDAGMutation()); DAG->addMutation(createSchedBarrierDAGMutation()); + if (isPassEnabled(EnableVOPD, CodeGenOpt::Less)) + DAG->addMutation(createVOPDPairingMutation()); return DAG; } @@ -1399,6 +1410,8 @@ } void GCNPassConfig::addPreEmitPass() { + if (isPassEnabled(EnableVOPD, CodeGenOpt::Less)) + addPass(&GCNCreateVOPDID); addPass(createSIMemoryLegalizerPass()); addPass(createSIInsertWaitcntsPass()); diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -100,6 +100,7 @@ AMDGPUUnifyDivergentExitNodes.cpp AMDGPUUnifyMetadata.cpp R600MachineCFGStructurizer.cpp + GCNCreateVOPD.cpp GCNDPPCombine.cpp GCNHazardRecognizer.cpp GCNILPSched.cpp @@ -109,6 +110,7 @@ GCNPreRAOptimizations.cpp GCNRegPressure.cpp GCNSchedStrategy.cpp + GCNVOPDUtils.cpp R600AsmPrinter.cpp R600ClauseMergePass.cpp R600ControlFlowFinalizer.cpp diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp @@ -0,0 +1,181 @@ +//===- GCNCreateVOPD.cpp - Create VOPD Instructions ----------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file +/// Combine VALU pairs into VOPD instructions +/// Only works on wave32 +/// Has register requirements, we reject creating VOPD if the requirements are +/// not met. +/// shouldCombineVOPD mutator in postRA machine scheduler puts candidate +/// instructions for VOPD back-to-back +/// +// +//===----------------------------------------------------------------------===// + +#include "AMDGPU.h" +#include "GCNSubtarget.h" +#include "GCNVOPDUtils.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "Utils/AMDGPUBaseInfo.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/Statistic.h" +#include "llvm/ADT/StringMap.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/Support/Casting.h" +#include "llvm/Support/Debug.h" +#include + +#define DEBUG_TYPE "gcn-create-vopd" +STATISTIC(NumVOPDCreated, "Number of VOPD Insts Created."); + +using namespace llvm; + +namespace { + +class GCNCreateVOPD : public MachineFunctionPass { +private: +public: + static char ID; + const GCNSubtarget *ST = nullptr; + + GCNCreateVOPD() : MachineFunctionPass(ID) {} + + void getAnalysisUsage(AnalysisUsage &AU) const override { + AU.setPreservesCFG(); + MachineFunctionPass::getAnalysisUsage(AU); + } + + StringRef getPassName() const override { + return "GCN Create VOPD Instructions"; + } + + bool doReplace(const SIInstrInfo *SII, + std::pair &Pair) { + auto *FirstMI = Pair.first; + auto *SecondMI = Pair.second; + unsigned Opc1 = FirstMI->getOpcode(); + unsigned Opc2 = SecondMI->getOpcode(); + int NewOpcode = AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(Opc1), + AMDGPU::getVOPDOpcode(Opc2)); + assert(NewOpcode != -1 && + "Should have previously determined this as a possible VOPD\n"); + + auto VOPDInst = BuildMI(*FirstMI->getParent(), FirstMI, + FirstMI->getDebugLoc(), SII->get(NewOpcode)) + .setMIFlags(FirstMI->getFlags() | SecondMI->getFlags()); + VOPDInst.add(FirstMI->getOperand(0)) + .add(SecondMI->getOperand(0)) + .add(FirstMI->getOperand(1)); + + switch (Opc1) { + case AMDGPU::V_MOV_B32_e32: + break; + case AMDGPU::V_FMAMK_F32: + VOPDInst.add(FirstMI->getOperand(2)); + VOPDInst.add(FirstMI->getOperand(3)); + break; + case AMDGPU::V_FMAAK_F32: + VOPDInst.add(FirstMI->getOperand(2)); + VOPDInst.add(FirstMI->getOperand(3)); + break; + default: + VOPDInst.add(FirstMI->getOperand(2)); + break; + } + + VOPDInst.add(SecondMI->getOperand(1)); + + switch (Opc2) { + case AMDGPU::V_MOV_B32_e32: + break; + case AMDGPU::V_FMAMK_F32: + VOPDInst.add(SecondMI->getOperand(2)); + VOPDInst.add(SecondMI->getOperand(3)); + break; + case AMDGPU::V_FMAAK_F32: + VOPDInst.add(SecondMI->getOperand(2)); + VOPDInst.add(SecondMI->getOperand(3)); + break; + default: + VOPDInst.add(SecondMI->getOperand(2)); + break; + } + + VOPDInst.copyImplicitOps(*FirstMI); + VOPDInst.copyImplicitOps(*SecondMI); + + LLVM_DEBUG(dbgs() << "VOPD Fused: " << *VOPDInst << " from\tX: " + << *Pair.first << "\tY: " << *Pair.second << "\n"); + FirstMI->eraseFromParent(); + SecondMI->eraseFromParent(); + ++NumVOPDCreated; + return true; + } + + bool runOnMachineFunction(MachineFunction &MF) override { + if (skipFunction(MF.getFunction())) + return false; + ST = &MF.getSubtarget(); + if (!AMDGPU::hasVOPD(*ST) || !ST->isWave32()) + return false; + LLVM_DEBUG(dbgs() << "CreateVOPD Pass:\n"); + + const SIInstrInfo *SII = ST->getInstrInfo(); + bool Changed = false; + + SmallVector> ReplaceCandidates; + + for (auto &MBB : MF) { + auto MII = MBB.begin(), E = MBB.end(); + while (MII != E) { + auto *FirstMI = &*MII; + MII = next_nodbg(MII, MBB.end()); + if (MII == MBB.end()) + break; + if (FirstMI->isDebugInstr()) + continue; + auto *SecondMI = &*MII; + unsigned Opc = FirstMI->getOpcode(); + unsigned Opc2 = SecondMI->getOpcode(); + llvm::AMDGPU::CanBeVOPD FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc); + llvm::AMDGPU::CanBeVOPD SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2); + std::pair Pair; + + if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y) + Pair = {FirstMI, SecondMI}; + else if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X) + Pair = {SecondMI, FirstMI}; + else + continue; + // checkVOPDRegConstraints cares about program order, but doReplace + // cares about X-Y order in the constituted VOPD + if (llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI)) { + ReplaceCandidates.push_back(Pair); + ++MII; + } + } + } + for (auto &Pair : ReplaceCandidates) { + Changed |= doReplace(SII, Pair); + } + + return Changed; + } +}; + +} // namespace + +char GCNCreateVOPD::ID = 0; + +char &llvm::GCNCreateVOPDID = GCNCreateVOPD::ID; + +INITIALIZE_PASS(GCNCreateVOPD, DEBUG_TYPE, "GCN Create VOPD Instructions", + false, false) diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h @@ -0,0 +1,32 @@ +//===- GCNVOPDUtils.h - GCN VOPD Utils ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the AMDGPU DAG scheduling +/// mutation to pair VOPD instructions back to back. It also contains +// subroutines useful in the creation of VOPD instructions +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIB_TARGET_AMDGPU_VOPDUTILS_H +#define LLVM_LIB_TARGET_AMDGPU_VOPDUTILS_H + +#include "llvm/CodeGen/MachineScheduler.h" + +namespace llvm { + +class SIInstrInfo; + +bool checkVOPDRegConstraints(const SIInstrInfo &TII, + const MachineInstr &FirstMI, + const MachineInstr &SecondMI); + +std::unique_ptr createVOPDPairingMutation(); + +} // namespace llvm + +#endif // LLVM_LIB_TARGET_AMDGPU_VOPDUTILS_H diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp new file mode 100644 --- /dev/null +++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp @@ -0,0 +1,212 @@ +//===- GCNVOPDUtils.cpp - GCN VOPD Utils ------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +/// \file This file contains the AMDGPU DAG scheduling +/// mutation to pair VOPD instructions back to back. It also contains +// subroutines useful in the creation of VOPD instructions +// +//===----------------------------------------------------------------------===// + +#include "GCNVOPDUtils.h" +#include "AMDGPUSubtarget.h" +#include "GCNSubtarget.h" +#include "MCTargetDesc/AMDGPUMCTargetDesc.h" +#include "SIInstrInfo.h" +#include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/CodeGen/MachineBasicBlock.h" +#include "llvm/CodeGen/MachineInstr.h" +#include "llvm/CodeGen/MachineOperand.h" +#include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MacroFusion.h" +#include "llvm/CodeGen/ScheduleDAG.h" +#include "llvm/CodeGen/ScheduleDAGMutation.h" +#include "llvm/CodeGen/TargetInstrInfo.h" +#include "llvm/MC/MCInst.h" + +using namespace llvm; + +#define DEBUG_TYPE "gcn-vopd-utils" + +bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII, + const MachineInstr &FirstMI, + const MachineInstr &SecondMI) { + const MachineFunction *MF = FirstMI.getMF(); + const GCNSubtarget &ST = MF->getSubtarget(); + const SIRegisterInfo *TRI = dyn_cast(ST.getRegisterInfo()); + const MachineRegisterInfo &MRI = MF->getRegInfo(); + const unsigned NumVGPRBanks = 4; + // Literals also count against scalar bus limit + SmallVector UniqueLiterals; + auto addLiteral = [&](const MachineOperand &Op) { + for (auto &Literal : UniqueLiterals) { + if (Literal->isIdenticalTo(Op)) + return; + } + UniqueLiterals.push_back(&Op); + }; + SmallVector UniqueScalarRegs; + assert([&]() -> bool { + for (auto MII = MachineBasicBlock::const_iterator(&FirstMI); + MII != FirstMI.getParent()->instr_end(); ++MII) { + if (&*MII == &SecondMI) + return true; + } + return false; + }() && "Expected FirstMI to precede SecondMI"); + // Cannot pair dependent instructions + for (const auto &Use : SecondMI.uses()) + if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg())) + return false; + + struct ComponentInfo { + ComponentInfo(const MachineInstr &MI) : MI(MI) {} + Register Dst, Reg0, Reg1, Reg2; + const MachineInstr &MI; + }; + ComponentInfo CInfo[] = {ComponentInfo(FirstMI), ComponentInfo(SecondMI)}; + + for (ComponentInfo &Comp : CInfo) { + switch (Comp.MI.getOpcode()) { + case AMDGPU::V_FMAMK_F32: + // cannot inline the fixed literal in fmamk + addLiteral(Comp.MI.getOperand(2)); + Comp.Reg2 = Comp.MI.getOperand(3).getReg(); + break; + case AMDGPU::V_FMAAK_F32: + // cannot inline the fixed literal in fmaak + addLiteral(Comp.MI.getOperand(3)); + Comp.Reg1 = Comp.MI.getOperand(2).getReg(); + break; + case AMDGPU::V_FMAC_F32_e32: + case AMDGPU::V_DOT2_F32_F16: + case AMDGPU::V_DOT2_F32_BF16: + Comp.Reg1 = Comp.MI.getOperand(2).getReg(); + Comp.Reg2 = Comp.MI.getOperand(0).getReg(); + break; + case AMDGPU::V_CNDMASK_B32_e32: + UniqueScalarRegs.push_back(AMDGPU::VCC_LO); + Comp.Reg1 = Comp.MI.getOperand(2).getReg(); + break; + case AMDGPU::V_MOV_B32_e32: + break; + default: + Comp.Reg1 = Comp.MI.getOperand(2).getReg(); + break; + } + + Comp.Dst = Comp.MI.getOperand(0).getReg(); + + const MachineOperand &Op0 = Comp.MI.getOperand(1); + if (Op0.isReg()) { + if (!TRI->isVectorRegister(MRI, Op0.getReg())) { + if (!is_contained(UniqueScalarRegs, Op0.getReg())) + UniqueScalarRegs.push_back(Op0.getReg()); + } else + Comp.Reg0 = Op0.getReg(); + } else { + if (!TII.isInlineConstant(Comp.MI, 1)) + addLiteral(Op0); + } + } + + if (UniqueLiterals.size() > 1) + return false; + if ((UniqueLiterals.size() + UniqueScalarRegs.size()) > 2) + return false; + + // check port 0 + if (CInfo[0].Reg0 && CInfo[1].Reg0 && + CInfo[0].Reg0 % NumVGPRBanks == CInfo[1].Reg0 % NumVGPRBanks) + return false; + // check port 1 + if (CInfo[0].Reg1 && CInfo[1].Reg1 && + CInfo[0].Reg1 % NumVGPRBanks == CInfo[1].Reg1 % NumVGPRBanks) + return false; + // check port 2 + if (CInfo[0].Reg2 && CInfo[1].Reg2 && + !((CInfo[0].Reg2 ^ CInfo[1].Reg2) & 0x1)) + return false; + if (!((CInfo[0].Dst ^ CInfo[1].Dst) & 0x1)) + return false; + + LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: " << FirstMI + << "\n\tY: " << SecondMI << "\n"); + return true; +} + +/// Check if the instr pair, FirstMI and SecondMI, should be scheduled +/// together. Given SecondMI, when FirstMI is unspecified, then check if +/// SecondMI may be part of a fused pair at all. +static bool shouldScheduleVOPDAdjacent(const TargetInstrInfo &TII, + const TargetSubtargetInfo &TSI, + const MachineInstr *FirstMI, + const MachineInstr &SecondMI) { + const SIInstrInfo &STII = static_cast(TII); + unsigned Opc2 = SecondMI.getOpcode(); + auto SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2); + + // One instruction case + if (!FirstMI) + return SecondCanBeVOPD.Y; + + unsigned Opc = FirstMI->getOpcode(); + auto FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc); + + if (!((FirstCanBeVOPD.X && SecondCanBeVOPD.Y) || + (FirstCanBeVOPD.Y && SecondCanBeVOPD.X))) + return false; + + return checkVOPDRegConstraints(STII, *FirstMI, SecondMI); +} + +/// Adapts design from MacroFusion +/// Puts valid candidate instructions back-to-back so they can easily +/// be turned into VOPD instructions +/// Greedily pairs instruction candidates. O(n^2) algorithm. +struct VOPDPairingMutation : ScheduleDAGMutation { + ShouldSchedulePredTy shouldScheduleAdjacent; // NOLINT: function pointer + + VOPDPairingMutation( + ShouldSchedulePredTy shouldScheduleAdjacent) // NOLINT: function pointer + : shouldScheduleAdjacent(shouldScheduleAdjacent) {} + + void apply(ScheduleDAGInstrs *DAG) override { + const TargetInstrInfo &TII = *DAG->TII; + const GCNSubtarget &ST = DAG->MF.getSubtarget(); + if (!AMDGPU::hasVOPD(ST) || !ST.isWave32()) { + LLVM_DEBUG(dbgs() << "Target does not support VOPDPairingMutation\n"); + return; + } + + std::vector::iterator ISUI, JSUI; + for (ISUI = DAG->SUnits.begin(); ISUI != DAG->SUnits.end(); ++ISUI) { + const MachineInstr *IMI = ISUI->getInstr(); + if (!shouldScheduleAdjacent(TII, ST, nullptr, *IMI)) + continue; + if (!hasLessThanNumFused(*ISUI, 2)) + continue; + + for (JSUI = ISUI + 1; JSUI != DAG->SUnits.end(); ++JSUI) { + if (JSUI->isBoundaryNode()) + continue; + const MachineInstr *JMI = JSUI->getInstr(); + if (!hasLessThanNumFused(*JSUI, 2) || + !shouldScheduleAdjacent(TII, ST, IMI, *JMI)) + continue; + if (fuseInstructionPair(*DAG, *ISUI, *JSUI)) + break; + } + } + LLVM_DEBUG(dbgs() << "Completed VOPDPairingMutation\n"); + } +}; + +std::unique_ptr llvm::createVOPDPairingMutation() { + return std::make_unique(shouldScheduleVOPDAdjacent); +} diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3982,6 +3982,14 @@ int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0); int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1); int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2); + int Src3Idx = -1; + if (Src0Idx == -1) { + // VOPD V_DUAL_* instructions use different operand names. + Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X); + Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X); + Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y); + Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y); + } // Make sure the number of operands is correct. const MCInstrDesc &Desc = get(Opcode); @@ -4255,9 +4263,9 @@ // Only look at the true operands. Only a real operand can use the constant // bus, and we don't want to check pseudo-operands like the source modifier // flags. - for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) { + for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) { if (OpIdx == -1) - break; + continue; const MachineOperand &MO = MI.getOperand(OpIdx); if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) { if (MO.isReg()) { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2950,6 +2950,27 @@ let ValueCols = [["1"]]; } +def VOPDComponentTable : GenericTable { + let FilterClass = "VOPD_Component"; + let CppTypeName = "VOPDComponentInfo"; + let Fields = ["BaseVOP", "VOPDOp", "CanBeVOPDX"]; + let PrimaryKey = ["BaseVOP"]; + let PrimaryKeyName = "getVOPDComponentHelper"; +} + +def VOPDPairs : GenericTable { + let FilterClass = "VOPD_Base"; + let CppTypeName = "VOPDInfo"; + let Fields = ["Opcode", "OpX", "OpY"]; + let PrimaryKey = ["Opcode"]; + let PrimaryKeyName = "getVOPDOpcodeHelper"; +} + +def getVOPDInfoFromComponentOpcodes : SearchIndex { + let Table = VOPDPairs; + let Key = ["OpX", "OpY"]; +} + include "SIInstructions.td" include "DSInstructions.td" diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h @@ -470,6 +470,14 @@ LLVM_READONLY bool getMAIIsGFX940XDL(unsigned Opc); +struct CanBeVOPD { + bool X; + bool Y; +}; + +LLVM_READONLY +CanBeVOPD getCanBeVOPD(unsigned Opc); + LLVM_READONLY const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp, uint8_t NumComponents, @@ -482,6 +490,12 @@ LLVM_READONLY int getMCOpcode(uint16_t Opcode, unsigned Gen); +LLVM_READONLY +unsigned getVOPDOpcode(unsigned Opc); + +LLVM_READONLY +int getVOPDFull(unsigned OpX, unsigned OpY); + LLVM_READONLY unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc); diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp --- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp +++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp @@ -277,6 +277,18 @@ uint16_t Opcode; }; +struct VOPDComponentInfo { + uint16_t BaseVOP; + uint16_t VOPDOp; + bool CanBeVOPDX; +}; + +struct VOPDInfo { + uint16_t Opcode; + uint16_t OpX; + uint16_t OpY; +}; + #define GET_MTBUFInfoTable_DECL #define GET_MTBUFInfoTable_IMPL #define GET_MUBUFInfoTable_DECL @@ -293,6 +305,10 @@ #define GET_VOPC64DPPTable_IMPL #define GET_VOPC64DPP8Table_DECL #define GET_VOPC64DPP8Table_IMPL +#define GET_VOPDComponentTable_DECL +#define GET_VOPDComponentTable_IMPL +#define GET_VOPDPairs_DECL +#define GET_VOPDPairs_IMPL #define GET_WMMAOpcode2AddrMappingTable_DECL #define GET_WMMAOpcode2AddrMappingTable_IMPL #define GET_WMMAOpcode3AddrMappingTable_DECL @@ -398,6 +414,19 @@ return Info ? Info->is_gfx940_xdl : false; } +CanBeVOPD getCanBeVOPD(unsigned Opc) { + const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc); + if (Info) + return {Info->CanBeVOPDX, 1}; + else + return {0, 0}; +} + +unsigned getVOPDOpcode(unsigned Opc) { + const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc); + return Info ? Info->VOPDOp : ~0u; +} + unsigned mapWMMA2AddrTo3AddrOpcode(unsigned Opc) { const WMMAOpcodeMappingInfo *Info = getWMMAMappingInfoFrom2AddrOpcode(Opc); return Info ? Info->Opcode3Addr : ~0u; @@ -415,6 +444,11 @@ return getMCOpcodeGen(Opcode, static_cast(Gen)); } +int getVOPDFull(unsigned OpX, unsigned OpY) { + const VOPDInfo *Info = getVOPDInfoFromComponentOpcodes(OpX, OpY); + return Info ? Info->Opcode : -1; +} + namespace IsaInfo { AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI) diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10NSA %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefix=GFX10NSA %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -o - %s | FileCheck -check-prefix=GFX10NSA %s define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) { ; GFX6-LABEL: gather4_2d: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll @@ -94,14 +94,14 @@ ; GFX11-LABEL: load_2d_v4f32_xyzw_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v7, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: v_mov_b32_e32 v8, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v9, v7 -; GFX11-NEXT: v_mov_b32_e32 v10, v7 ; GFX11-NEXT: v_mov_b32_e32 v11, v7 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 @@ -109,10 +109,8 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: v_mov_b32_e32 v0, v7 -; GFX11-NEXT: v_mov_b32_e32 v1, v8 -; GFX11-NEXT: v_mov_b32_e32 v2, v9 -; GFX11-NEXT: v_mov_b32_e32 v3, v10 -; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11 ; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v7, v4, s[10:11] @@ -184,14 +182,14 @@ ; GFX11-LABEL: load_2d_v4f32_xyzw_tfe_lwe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v7, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: v_mov_b32_e32 v8, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v9, v7 -; GFX11-NEXT: v_mov_b32_e32 v10, v7 ; GFX11-NEXT: v_mov_b32_e32 v11, v7 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 @@ -199,10 +197,8 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: v_mov_b32_e32 v0, v7 -; GFX11-NEXT: v_mov_b32_e32 v1, v8 -; GFX11-NEXT: v_mov_b32_e32 v2, v9 -; GFX11-NEXT: v_mov_b32_e32 v3, v10 -; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11 ; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v7, v4, s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %slice, i16 %fragid) { ; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll @@ -98,14 +98,13 @@ ; GFX11-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v9, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v7, v2 -; GFX11-NEXT: v_mov_b32_e32 v8, v3 -; GFX11-NEXT: v_mov_b32_e32 v10, v9 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v8, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v11, v9 -; GFX11-NEXT: v_mov_b32_e32 v12, v9 ; GFX11-NEXT: v_mov_b32_e32 v13, v9 +; GFX11-NEXT: v_mov_b32_e32 v12, v9 +; GFX11-NEXT: v_mov_b32_e32 v10, v9 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 @@ -115,10 +114,8 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: v_mov_b32_e32 v0, v9 -; GFX11-NEXT: v_mov_b32_e32 v1, v10 -; GFX11-NEXT: v_mov_b32_e32 v2, v11 -; GFX11-NEXT: v_mov_b32_e32 v3, v12 -; GFX11-NEXT: v_mov_b32_e32 v4, v13 +; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 +; GFX11-NEXT: v_dual_mov_b32 v1, v10 :: v_dual_mov_b32 v4, v13 ; GFX11-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v9, v4, s[10:11] @@ -194,14 +191,13 @@ ; GFX11-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v9, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v7, v2 -; GFX11-NEXT: v_mov_b32_e32 v8, v3 -; GFX11-NEXT: v_mov_b32_e32 v10, v9 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v8, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v11, v9 -; GFX11-NEXT: v_mov_b32_e32 v12, v9 ; GFX11-NEXT: v_mov_b32_e32 v13, v9 +; GFX11-NEXT: v_mov_b32_e32 v12, v9 +; GFX11-NEXT: v_mov_b32_e32 v10, v9 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 @@ -211,10 +207,8 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: v_mov_b32_e32 v0, v9 -; GFX11-NEXT: v_mov_b32_e32 v1, v10 -; GFX11-NEXT: v_mov_b32_e32 v2, v11 -; GFX11-NEXT: v_mov_b32_e32 v3, v12 -; GFX11-NEXT: v_mov_b32_e32 v4, v13 +; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 +; GFX11-NEXT: v_dual_mov_b32 v1, v10 :: v_dual_mov_b32 v4, v13 ; GFX11-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v9, v4, s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll @@ -113,12 +113,12 @@ ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_lshl_b32 s8, s0, 16 -; GFX11-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-NEXT: v_mov_b32_e32 v7, v5 -; GFX11-NEXT: v_mov_b32_e32 v8, v5 ; GFX11-NEXT: v_mov_b32_e32 v9, v5 +; GFX11-NEXT: v_mov_b32_e32 v8, v5 ; GFX11-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 ; GFX11-NEXT: v_and_or_b32 v11, 0xffff, v2, s8 ; GFX11-NEXT: s_mov_b32 s1, s3 @@ -126,10 +126,8 @@ ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: v_mov_b32_e32 v0, v5 -; GFX11-NEXT: v_mov_b32_e32 v1, v6 -; GFX11-NEXT: v_mov_b32_e32 v2, v7 -; GFX11-NEXT: v_mov_b32_e32 v3, v8 -; GFX11-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v1, v6 +; GFX11-NEXT: v_dual_mov_b32 v3, v8 :: v_dual_mov_b32 v4, v9 ; GFX11-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v5, v4, s[10:11] @@ -211,12 +209,12 @@ ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_lshl_b32 s8, s0, 16 -; GFX11-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-NEXT: v_mov_b32_e32 v7, v5 -; GFX11-NEXT: v_mov_b32_e32 v8, v5 ; GFX11-NEXT: v_mov_b32_e32 v9, v5 +; GFX11-NEXT: v_mov_b32_e32 v8, v5 ; GFX11-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 ; GFX11-NEXT: v_and_or_b32 v11, 0xffff, v2, s8 ; GFX11-NEXT: s_mov_b32 s1, s3 @@ -224,10 +222,8 @@ ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: v_mov_b32_e32 v0, v5 -; GFX11-NEXT: v_mov_b32_e32 v1, v6 -; GFX11-NEXT: v_mov_b32_e32 v2, v7 -; GFX11-NEXT: v_mov_b32_e32 v3, v8 -; GFX11-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v1, v6 +; GFX11-NEXT: v_dual_mov_b32 v3, v8 :: v_dual_mov_b32 v4, v9 ; GFX11-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v5, v4, s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll @@ -95,16 +95,15 @@ ; ; GFX11-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v7, v2 +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: s_mov_b32 s0, s2 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 -; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 -; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 @@ -112,10 +111,8 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: v_mov_b32_e32 v0, v8 -; GFX11-NEXT: v_mov_b32_e32 v1, v9 -; GFX11-NEXT: v_mov_b32_e32 v2, v10 -; GFX11-NEXT: v_mov_b32_e32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[10:11] @@ -188,16 +185,15 @@ ; ; GFX11-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v7, v2 +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: s_mov_b32 s0, s2 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 -; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 -; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 @@ -205,10 +201,8 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: v_mov_b32_e32 v0, v8 -; GFX11-NEXT: v_mov_b32_e32 v1, v9 -; GFX11-NEXT: v_mov_b32_e32 v2, v10 -; GFX11-NEXT: v_mov_b32_e32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_d_1d: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll @@ -622,8 +622,7 @@ ; ; GFX11-LABEL: image_store_f32_dmask_1111: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v1, s10 -; GFX11-NEXT: v_mov_b32_e32 v2, s11 +; GFX11-NEXT: v_dual_mov_b32 v1, s10 :: v_dual_mov_b32 v2, s11 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll @@ -42,9 +42,8 @@ ; GCN-NEXT: lds_param_load v2, attr2.x wait_vdst:15 ; GCN-NEXT: lds_param_load v3, attr3.x wait_vdst:15 ; GCN-NEXT: s_mov_b32 exec_lo, s3 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GCN-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3 ; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2 ; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -228,10 +228,8 @@ ; ; GFX11-LABEL: image_bvh_intersect_ray_vgpr_descr: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v18, v0 -; GFX11-NEXT: v_mov_b32_e32 v19, v1 -; GFX11-NEXT: v_mov_b32_e32 v15, v2 -; GFX11-NEXT: v_mov_b32_e32 v16, v3 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v16, v3 ; GFX11-NEXT: v_mov_b32_e32 v17, v4 ; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 @@ -346,19 +344,18 @@ ; ; GFX11-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v13, v0 +; GFX11-NEXT: v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v14, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX11-NEXT: v_mov_b32_e32 v14, v1 -; GFX11-NEXT: v_mov_b32_e32 v15, v2 -; GFX11-NEXT: v_mov_b32_e32 v16, v3 +; GFX11-NEXT: v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v16, v3 +; GFX11-NEXT: v_dual_mov_b32 v17, v4 :: v_dual_lshlrev_b32 v2, 16, v5 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: v_mov_b32_e32 v17, v4 ; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: v_and_or_b32 v4, 0xffff, v7, v2 ; GFX11-NEXT: v_and_or_b32 v5, 0xffff, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_and_or_b32 v6, 0xffff, v8, v3 ; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v9 @@ -470,12 +467,9 @@ ; ; GFX11-LABEL: image_bvh64_intersect_ray_vgpr_descr: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v19, v0 -; GFX11-NEXT: v_mov_b32_e32 v20, v1 -; GFX11-NEXT: v_mov_b32_e32 v21, v2 -; GFX11-NEXT: v_mov_b32_e32 v16, v3 -; GFX11-NEXT: v_mov_b32_e32 v17, v4 -; GFX11-NEXT: v_mov_b32_e32 v18, v5 +; GFX11-NEXT: v_dual_mov_b32 v19, v0 :: v_dual_mov_b32 v20, v1 +; GFX11-NEXT: v_dual_mov_b32 v21, v2 :: v_dual_mov_b32 v16, v3 +; GFX11-NEXT: v_dual_mov_b32 v17, v4 :: v_dual_mov_b32 v18, v5 ; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v12 @@ -595,18 +589,17 @@ ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v14, v0 +; GFX11-NEXT: v_dual_mov_b32 v14, v0 :: v_dual_mov_b32 v15, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX11-NEXT: v_mov_b32_e32 v15, v1 -; GFX11-NEXT: v_mov_b32_e32 v16, v2 -; GFX11-NEXT: v_mov_b32_e32 v17, v3 +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v17, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v8 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX11-NEXT: v_mov_b32_e32 v18, v4 +; GFX11-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_lshlrev_b32 v3, 16, v7 ; GFX11-NEXT: v_mov_b32_e32 v19, v5 ; GFX11-NEXT: v_and_or_b32 v4, 0xffff, v8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_and_or_b32 v5, 0xffff, v1, v0 ; GFX11-NEXT: v_and_or_b32 v6, 0xffff, v9, v3 ; GFX11-NEXT: s_mov_b32 s1, exec_lo @@ -710,35 +703,31 @@ ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX11-NEXT: s_mov_b32 s8, 2.0 ; GFX11-NEXT: s_mov_b32 s9, 0x40400000 ; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 -; GFX11-NEXT: s_mov_b32 s8, 2.0 ; GFX11-NEXT: s_mov_b32 s11, 0x40a00000 ; GFX11-NEXT: s_mov_b32 s10, 4.0 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 ; GFX11-NEXT: v_mov_b32_e32 v6, s12 -; GFX11-NEXT: v_mov_b32_e32 v7, s13 -; GFX11-NEXT: v_mov_b32_e32 v8, s14 +; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: v_mov_b32_e32 v1, s5 -; GFX11-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-NEXT: v_mov_b32_e32 v3, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX11-NEXT: s_mov_b32 s7, 1.0 ; GFX11-NEXT: flat_load_b32 v9, v[0:1] ; GFX11-NEXT: flat_load_b32 v10, v[2:3] -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_mov_b32_e32 v3, s9 -; GFX11-NEXT: v_mov_b32_e32 v1, s7 -; GFX11-NEXT: v_mov_b32_e32 v2, s8 -; GFX11-NEXT: v_mov_b32_e32 v4, s10 -; GFX11-NEXT: v_mov_b32_e32 v5, s11 +; GFX11-NEXT: s_mov_b32 s7, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s9 +; GFX11-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v2, s8 +; GFX11-NEXT: v_dual_mov_b32 v5, s11 :: v_dual_mov_b32 v4, s10 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[0:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -865,29 +854,26 @@ ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; GFX11-NEXT: s_mov_b32 s9, 0x42004600 ; GFX11-NEXT: s_mov_b32 s8, 2.0 +; GFX11-NEXT: s_mov_b32 s9, 0x42004600 ; GFX11-NEXT: s_mov_b32 s10, 0x44004700 ; GFX11-NEXT: s_mov_b32 s11, 0x45004800 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: v_mov_b32_e32 v1, s5 -; GFX11-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-NEXT: v_mov_b32_e32 v3, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX11-NEXT: s_mov_b32 s7, 1.0 ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_mov_b32_e32 v3, s9 -; GFX11-NEXT: v_mov_b32_e32 v1, s7 -; GFX11-NEXT: v_mov_b32_e32 v2, s8 -; GFX11-NEXT: v_mov_b32_e32 v4, s10 -; GFX11-NEXT: v_mov_b32_e32 v5, s11 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s9 +; GFX11-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v2, s8 +; GFX11-NEXT: v_dual_mov_b32 v5, s11 :: v_dual_mov_b32 v4, s10 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[0:3] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -978,34 +964,29 @@ ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: s_mov_b32 s8, 2.0 +; GFX11-NEXT: s_mov_b32 s7, 1.0 ; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: s_mov_b32 s9, 0x40400000 ; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 -; GFX11-NEXT: s_mov_b32 s8, 2.0 -; GFX11-NEXT: s_mov_b32 s7, 1.0 ; GFX11-NEXT: s_mov_b32 s11, 0x40a00000 ; GFX11-NEXT: s_mov_b32 s10, 4.0 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 -; GFX11-NEXT: v_mov_b32_e32 v3, s9 ; GFX11-NEXT: v_mov_b32_e32 v6, s12 -; GFX11-NEXT: v_mov_b32_e32 v4, s10 -; GFX11-NEXT: v_mov_b32_e32 v5, s11 -; GFX11-NEXT: v_mov_b32_e32 v7, s13 -; GFX11-NEXT: v_mov_b32_e32 v8, s14 +; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v3, s9 +; GFX11-NEXT: v_dual_mov_b32 v4, s10 :: v_dual_mov_b32 v7, s13 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v5, s11 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-NEXT: s_mov_b32 s4, 0xb36211c7 ; GFX11-NEXT: s_movk_i32 s5, 0x102 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_mov_b32_e32 v9, s4 ; GFX11-NEXT: flat_load_b32 v11, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX11-NEXT: v_mov_b32_e32 v2, s8 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3] @@ -1131,11 +1112,9 @@ ; GFX11-NEXT: s_mov_b32 s7, 1.0 ; GFX11-NEXT: s_mov_b32 s10, 0x44004700 ; GFX11-NEXT: s_mov_b32 s11, 0x45004800 -; GFX11-NEXT: v_mov_b32_e32 v3, s9 -; GFX11-NEXT: v_mov_b32_e32 v4, s10 -; GFX11-NEXT: v_mov_b32_e32 v5, s11 +; GFX11-NEXT: v_dual_mov_b32 v3, s9 :: v_dual_mov_b32 v4, s10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v5, s11 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-NEXT: s_mov_b32 s4, 0xb36211c6 ; GFX11-NEXT: s_movk_i32 s5, 0x102 @@ -1145,8 +1124,7 @@ ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: flat_load_b32 v8, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX11-NEXT: v_mov_b32_e32 v2, s8 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/saddsat.ll @@ -4277,8 +4277,7 @@ ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v6, s1 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result @@ -4354,24 +4353,42 @@ ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_saddsat_i64: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_add_u32 s4, s0, s2 -; GFX10PLUS-NEXT: s_addc_u32 s5, s1, s3 -; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0 -; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], s[0:1] -; GFX10PLUS-NEXT: s_mov_b32 s3, 0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s5, 31 -; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s4 -; GFX10PLUS-NEXT: v_mov_b32_e32 v1, s5 -; GFX10PLUS-NEXT: s_xor_b32 s2, s2, s1 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s3, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 -; GFX10PLUS-NEXT: s_addc_u32 s1, s0, 0x80000000 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_saddsat_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s4, s0, s2 +; GFX10-NEXT: s_addc_u32 s5, s1, s3 +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], s[0:1] +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_ashr_i32 s0, s5, 31 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_xor_b32 s2, s2, s1 +; GFX10-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 +; GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_saddsat_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_add_u32 s4, s0, s2 +; GFX11-NEXT: s_addc_u32 s5, s1, s3 +; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[2:3], 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], s[0:1] +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_ashr_i32 s0, s5, 31 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_xor_b32 s2, s2, s1 +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 +; GFX11-NEXT: s_addc_u32 s1, s0, 0x80000000 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result } @@ -4425,19 +4442,32 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: saddsat_i64_sv: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX10PLUS-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] -; GFX10PLUS-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] -; GFX10PLUS-NEXT: s_mov_b32 s1, 0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1 -; GFX10PLUS-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: saddsat_i64_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] +; GFX10-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1 +; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: saddsat_i64_sv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, s0, v0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] +; GFX11-NEXT: v_cmp_gt_i64_e64 s0, 0, v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1 +; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 +; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -4492,19 +4522,32 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: saddsat_i64_vs: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s1, s[0:1], 0 -; GFX10PLUS-NEXT: s_mov_b32 s0, 0 -; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX10PLUS-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10PLUS-NEXT: v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0 -; GFX10PLUS-NEXT: s_xor_b32 vcc_lo, s1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: saddsat_i64_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 +; GFX10-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[0:1], 0 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0 +; GFX10-NEXT: s_xor_b32 vcc_lo, s1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: saddsat_i64_vs: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v0, s0 +; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[0:1], 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0 +; GFX11-NEXT: s_xor_b32 vcc_lo, s1, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 +; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.sadd.sat.i64(i64 %lhs, i64 %rhs) %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -4629,11 +4672,9 @@ ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, v[10:11], v[2:3] ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s1, 0x80000000, v4, s1 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1 ; GFX11-NEXT: s_xor_b32 vcc_lo, s3, s2 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v4, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v2, v10, v4 :: v_dual_cndmask_b32 v3, v11, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) ret <2 x i64> %result @@ -4766,38 +4807,69 @@ ; GFX9-NEXT: v_readfirstlane_b32 s3, v1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_saddsat_v2i64: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_add_u32 s8, s0, s4 -; GFX10PLUS-NEXT: s_addc_u32 s9, s1, s5 -; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0 -; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1] -; GFX10PLUS-NEXT: s_mov_b32 s10, 0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s9, 31 -; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s8 -; GFX10PLUS-NEXT: v_mov_b32_e32 v1, s9 -; GFX10PLUS-NEXT: s_xor_b32 s8, s4, s1 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 -; GFX10PLUS-NEXT: s_addc_u32 s1, s0, 0x80000000 -; GFX10PLUS-NEXT: s_add_u32 s4, s2, s6 -; GFX10PLUS-NEXT: s_addc_u32 s5, s3, s7 -; GFX10PLUS-NEXT: v_mov_b32_e32 v2, s4 -; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] -; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s3, s[6:7], 0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s5, 31 -; GFX10PLUS-NEXT: v_mov_b32_e32 v3, s5 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 -; GFX10PLUS-NEXT: s_xor_b32 s2, s3, s2 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 -; GFX10PLUS-NEXT: s_addc_u32 s1, s0, 0x80000000 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, s1, s2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_saddsat_v2i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s8, s0, s4 +; GFX10-NEXT: s_addc_u32 s9, s1, s5 +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1] +; GFX10-NEXT: s_mov_b32 s10, 0 +; GFX10-NEXT: s_ashr_i32 s0, s9, 31 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: s_xor_b32 s8, s4, s1 +; GFX10-NEXT: s_cmp_lg_u32 s10, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 +; GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 +; GFX10-NEXT: s_add_u32 s4, s2, s6 +; GFX10-NEXT: s_addc_u32 s5, s3, s7 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s3, s[6:7], 0 +; GFX10-NEXT: s_ashr_i32 s0, s5, 31 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 +; GFX10-NEXT: s_xor_b32 s2, s3, s2 +; GFX10-NEXT: s_cmp_lg_u32 s10, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 +; GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, s2 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_saddsat_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_add_u32 s8, s0, s4 +; GFX11-NEXT: s_addc_u32 s9, s1, s5 +; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[4:5], 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1] +; GFX11-NEXT: s_mov_b32 s10, 0 +; GFX11-NEXT: s_ashr_i32 s0, s9, 31 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: s_xor_b32 s8, s4, s1 +; GFX11-NEXT: s_cmp_lg_u32 s10, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 +; GFX11-NEXT: s_addc_u32 s1, s0, 0x80000000 +; GFX11-NEXT: s_add_u32 s4, s2, s6 +; GFX11-NEXT: s_addc_u32 s5, s3, s7 +; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] +; GFX11-NEXT: v_cmp_lt_i64_e64 s3, s[6:7], 0 +; GFX11-NEXT: s_ashr_i32 s0, s5, 31 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 +; GFX11-NEXT: s_xor_b32 s2, s3, s2 +; GFX11-NEXT: s_cmp_lg_u32 s10, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 +; GFX11-NEXT: s_addc_u32 s1, s0, 0x80000000 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s1, s2 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 +; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i64> @llvm.sadd.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) ret <2 x i64> %result } @@ -5871,91 +5943,176 @@ ; GFX9-NEXT: v_readfirstlane_b32 s7, v3 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_saddsat_v2i128: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_add_u32 s8, s0, s8 -; GFX10PLUS-NEXT: s_addc_u32 s9, s1, s9 -; GFX10PLUS-NEXT: s_addc_u32 s16, s2, s10 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] -; GFX10PLUS-NEXT: s_addc_u32 s17, s3, s11 -; GFX10PLUS-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] -; GFX10PLUS-NEXT: v_mov_b32_e32 v5, s17 -; GFX10PLUS-NEXT: s_cselect_b32 s18, 1, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] -; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10PLUS-NEXT: s_and_b32 s0, 1, s18 -; GFX10PLUS-NEXT: s_cmp_eq_u64 s[10:11], 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 -; GFX10PLUS-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10PLUS-NEXT: s_and_b32 s1, 1, s1 -; GFX10PLUS-NEXT: s_ashr_i32 s2, s17, 31 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 -; GFX10PLUS-NEXT: s_mov_b32 s0, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s0, 0 -; GFX10PLUS-NEXT: s_addc_u32 s1, s2, 0 -; GFX10PLUS-NEXT: s_addc_u32 s10, s2, 0 -; GFX10PLUS-NEXT: s_addc_u32 s3, s2, 0x80000000 -; GFX10PLUS-NEXT: s_add_u32 s12, s4, s12 -; GFX10PLUS-NEXT: s_addc_u32 s13, s5, s13 -; GFX10PLUS-NEXT: s_addc_u32 s18, s6, s14 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s4, s[12:13], s[4:5] -; GFX10PLUS-NEXT: s_addc_u32 s19, s7, s15 -; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s5, s[14:15], 0 -; GFX10PLUS-NEXT: s_cmp_eq_u64 s[18:19], s[6:7] -; GFX10PLUS-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10PLUS-NEXT: s_cselect_b32 s0, 1, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 -; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s4, s[18:19], s[6:7] -; GFX10PLUS-NEXT: s_and_b32 s0, 1, s0 -; GFX10PLUS-NEXT: s_cmp_eq_u64 s[14:15], 0 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 -; GFX10PLUS-NEXT: s_cselect_b32 s4, 1, 0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v1, s8 -; GFX10PLUS-NEXT: s_and_b32 s4, 1, s4 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v3, v2, s0 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 s0, 0, s4 -; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s9 -; GFX10PLUS-NEXT: v_mov_b32_e32 v6, s13 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo -; GFX10PLUS-NEXT: v_mov_b32_e32 v7, s19 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v4, s16 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, s1, vcc_lo -; GFX10PLUS-NEXT: s_mov_b32 s1, 0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s19, 31 -; GFX10PLUS-NEXT: v_xor_b32_e32 v2, v3, v2 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s1, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v4, s10, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, v5, s3, vcc_lo -; GFX10PLUS-NEXT: v_mov_b32_e32 v5, s12 -; GFX10PLUS-NEXT: v_and_b32_e32 v2, 1, v2 -; GFX10PLUS-NEXT: s_addc_u32 s1, s0, 0 -; GFX10PLUS-NEXT: s_addc_u32 s2, s0, 0 -; GFX10PLUS-NEXT: s_addc_u32 s3, s0, 0x80000000 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 -; GFX10PLUS-NEXT: v_mov_b32_e32 v2, s18 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v6, v6, s1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v7, v7, s3, vcc_lo -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v1 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v3 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v4 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v5 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s5, v6 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s6, v2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s7, v7 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_saddsat_v2i128: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s8, s0, s8 +; GFX10-NEXT: s_addc_u32 s9, s1, s9 +; GFX10-NEXT: s_addc_u32 s16, s2, s10 +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] +; GFX10-NEXT: s_addc_u32 s17, s3, s11 +; GFX10-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] +; GFX10-NEXT: v_mov_b32_e32 v5, s17 +; GFX10-NEXT: s_cselect_b32 s18, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX10-NEXT: s_and_b32 s0, 1, s18 +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 +; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: s_and_b32 s1, 1, s1 +; GFX10-NEXT: s_ashr_i32 s2, s17, 31 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: s_cmp_lg_u32 s0, 0 +; GFX10-NEXT: s_addc_u32 s1, s2, 0 +; GFX10-NEXT: s_addc_u32 s10, s2, 0 +; GFX10-NEXT: s_addc_u32 s3, s2, 0x80000000 +; GFX10-NEXT: s_add_u32 s12, s4, s12 +; GFX10-NEXT: s_addc_u32 s13, s5, s13 +; GFX10-NEXT: s_addc_u32 s18, s6, s14 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[12:13], s[4:5] +; GFX10-NEXT: s_addc_u32 s19, s7, s15 +; GFX10-NEXT: v_cmp_lt_i64_e64 s5, s[14:15], 0 +; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[6:7] +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[18:19], s[6:7] +; GFX10-NEXT: s_and_b32 s0, 1, s0 +; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX10-NEXT: s_cselect_b32 s4, 1, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_and_b32 s4, 1, s4 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v3, v2, s0 +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, s4 +; GFX10-NEXT: v_mov_b32_e32 v0, s9 +; GFX10-NEXT: v_mov_b32_e32 v6, s13 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v7, s19 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0 +; GFX10-NEXT: v_mov_b32_e32 v4, s16 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s1, vcc_lo +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: s_ashr_i32 s0, s19, 31 +; GFX10-NEXT: v_xor_b32_e32 v2, v3, v2 +; GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v4, s10, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, s12 +; GFX10-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX10-NEXT: s_addc_u32 s1, s0, 0 +; GFX10-NEXT: s_addc_u32 s2, s0, 0 +; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX10-NEXT: v_mov_b32_e32 v2, s18 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s3, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v0 +; GFX10-NEXT: v_readfirstlane_b32 s2, v3 +; GFX10-NEXT: v_readfirstlane_b32 s3, v4 +; GFX10-NEXT: v_readfirstlane_b32 s4, v5 +; GFX10-NEXT: v_readfirstlane_b32 s5, v6 +; GFX10-NEXT: v_readfirstlane_b32 s6, v2 +; GFX10-NEXT: v_readfirstlane_b32 s7, v7 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_saddsat_v2i128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_add_u32 s8, s0, s8 +; GFX11-NEXT: s_addc_u32 s9, s1, s9 +; GFX11-NEXT: s_addc_u32 s16, s2, s10 +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] +; GFX11-NEXT: s_addc_u32 s17, s3, s11 +; GFX11-NEXT: s_cmp_eq_u64 s[16:17], s[2:3] +; GFX11-NEXT: v_mov_b32_e32 v5, s17 +; GFX11-NEXT: s_cselect_b32 s18, 1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[16:17], s[2:3] +; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[10:11], 0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11-NEXT: s_and_b32 s0, 1, s18 +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 +; GFX11-NEXT: s_cselect_b32 s1, 1, 0 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: s_and_b32 s1, 1, s1 +; GFX11-NEXT: s_ashr_i32 s2, s17, 31 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s1 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, v2, 0, s0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: s_cmp_lg_u32 s0, 0 +; GFX11-NEXT: s_addc_u32 s1, s2, 0 +; GFX11-NEXT: s_addc_u32 s10, s2, 0 +; GFX11-NEXT: s_addc_u32 s3, s2, 0x80000000 +; GFX11-NEXT: s_add_u32 s12, s4, s12 +; GFX11-NEXT: s_addc_u32 s13, s5, s13 +; GFX11-NEXT: s_addc_u32 s18, s6, s14 +; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[12:13], s[4:5] +; GFX11-NEXT: s_addc_u32 s19, s7, s15 +; GFX11-NEXT: v_cmp_lt_i64_e64 s5, s[14:15], 0 +; GFX11-NEXT: s_cmp_eq_u64 s[18:19], s[6:7] +; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[18:19], s[6:7] +; GFX11-NEXT: s_cselect_b32 s0, 1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s5 +; GFX11-NEXT: s_and_b32 s0, 1, s0 +; GFX11-NEXT: s_cmp_eq_u64 s[14:15], 0 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX11-NEXT: s_cselect_b32 s4, 1, 0 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: s_and_b32 s4, 1, s4 +; GFX11-NEXT: v_mov_b32_e32 v6, s13 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v3, v2, s0 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, s4 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_dual_mov_b32 v0, s9 :: v_dual_mov_b32 v7, s19 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v4, 0, s0 +; GFX11-NEXT: v_mov_b32_e32 v4, s16 +; GFX11-NEXT: s_ashr_i32 s0, s19, 31 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s2, vcc_lo +; GFX11-NEXT: v_xor_b32_e32 v2, v3, v2 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v4, s10, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v5, s3, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v5, s12 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s1, vcc_lo +; GFX11-NEXT: v_and_b32_e32 v2, 1, v2 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v2 +; GFX11-NEXT: v_mov_b32_e32 v2, s18 +; GFX11-NEXT: s_addc_u32 s1, s0, 0 +; GFX11-NEXT: s_addc_u32 s2, s0, 0 +; GFX11-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s3, vcc_lo +; GFX11-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-NEXT: v_readfirstlane_b32 s1, v0 +; GFX11-NEXT: v_readfirstlane_b32 s2, v3 +; GFX11-NEXT: v_readfirstlane_b32 s3, v4 +; GFX11-NEXT: v_readfirstlane_b32 s4, v5 +; GFX11-NEXT: v_readfirstlane_b32 s5, v6 +; GFX11-NEXT: v_readfirstlane_b32 s6, v2 +; GFX11-NEXT: v_readfirstlane_b32 s7, v7 +; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i128> @llvm.sadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) ret <2 x i128> %result } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/ssubsat.ll @@ -4263,8 +4263,7 @@ ; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[4:5], v[0:1] ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v6, s1 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v0, v4, v6, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v1, v5, v1, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v0, v4, v6 :: v_dual_cndmask_b32 v1, v5, v1 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result @@ -4340,24 +4339,42 @@ ; GFX9-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_ssubsat_i64: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_sub_u32 s4, s0, s2 -; GFX10PLUS-NEXT: s_subb_u32 s5, s1, s3 -; GFX10PLUS-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], 0 -; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], s[0:1] -; GFX10PLUS-NEXT: s_mov_b32 s3, 0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s5, 31 -; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s4 -; GFX10PLUS-NEXT: v_mov_b32_e32 v1, s5 -; GFX10PLUS-NEXT: s_xor_b32 s2, s2, s1 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s3, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 -; GFX10PLUS-NEXT: s_addc_u32 s1, s0, 0x80000000 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_ssubsat_i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_sub_u32 s4, s0, s2 +; GFX10-NEXT: s_subb_u32 s5, s1, s3 +; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], s[0:1] +; GFX10-NEXT: s_mov_b32 s3, 0 +; GFX10-NEXT: s_ashr_i32 s0, s5, 31 +; GFX10-NEXT: v_mov_b32_e32 v0, s4 +; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: s_xor_b32 s2, s2, s1 +; GFX10-NEXT: s_cmp_lg_u32 s3, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 +; GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_ssubsat_i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_sub_u32 s4, s0, s2 +; GFX11-NEXT: s_subb_u32 s5, s1, s3 +; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[2:3], 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[4:5], s[0:1] +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_ashr_i32 s0, s5, 31 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: s_xor_b32 s2, s2, s1 +; GFX11-NEXT: s_cmp_lg_u32 s3, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s2 +; GFX11-NEXT: s_addc_u32 s1, s0, 0x80000000 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s1, s2 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) ret i64 %result } @@ -4411,19 +4428,32 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: ssubsat_i64_sv: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0 -; GFX10PLUS-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX10PLUS-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] -; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] -; GFX10PLUS-NEXT: s_mov_b32 s1, 0 -; GFX10PLUS-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1 -; GFX10PLUS-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: ssubsat_i64_sv: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0 +; GFX10-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX10-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1 +; GFX10-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ssubsat_i64_sv: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, s0, v0 +; GFX11-NEXT: v_sub_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX11-NEXT: v_cmp_gt_i64_e32 vcc_lo, s[0:1], v[2:3] +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, 0, v[0:1] +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s1, 0x80000000, v4, s1 +; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 +; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -4478,19 +4508,32 @@ ; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: ssubsat_i64_vs: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 -; GFX10PLUS-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_gt_i64_e64 s1, s[0:1], 0 -; GFX10PLUS-NEXT: s_mov_b32 s0, 0 -; GFX10PLUS-NEXT: v_ashrrev_i32_e32 v4, 31, v3 -; GFX10PLUS-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] -; GFX10PLUS-NEXT: v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0 -; GFX10PLUS-NEXT: s_xor_b32 vcc_lo, s1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: ssubsat_i64_vs: +; GFX10: ; %bb.0: +; GFX10-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 +; GFX10-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX10-NEXT: v_cmp_gt_i64_e64 s1, s[0:1], 0 +; GFX10-NEXT: s_mov_b32 s0, 0 +; GFX10-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX10-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] +; GFX10-NEXT: v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0 +; GFX10-NEXT: s_xor_b32 vcc_lo, s1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v0, v2, v4, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v1, vcc_lo +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: ssubsat_i64_vs: +; GFX11: ; %bb.0: +; GFX11-NEXT: v_sub_co_u32 v2, vcc_lo, v0, s0 +; GFX11-NEXT: v_subrev_co_ci_u32_e32 v3, vcc_lo, s1, v1, vcc_lo +; GFX11-NEXT: v_cmp_gt_i64_e64 s1, s[0:1], 0 +; GFX11-NEXT: s_mov_b32 s0, 0 +; GFX11-NEXT: v_ashrrev_i32_e32 v4, 31, v3 +; GFX11-NEXT: v_cmp_lt_i64_e32 vcc_lo, v[2:3], v[0:1] +; GFX11-NEXT: v_add_co_ci_u32_e64 v1, s0, 0x80000000, v4, s0 +; GFX11-NEXT: s_xor_b32 vcc_lo, s1, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v0, v2, v4 :: v_dual_cndmask_b32 v1, v3, v1 +; GFX11-NEXT: ; return to shader part epilog %result = call i64 @llvm.ssub.sat.i64(i64 %lhs, i64 %rhs) %cast = bitcast i64 %result to <2 x float> ret <2 x float> %cast @@ -4615,11 +4658,9 @@ ; GFX11-NEXT: v_cmp_lt_i64_e64 s2, v[10:11], v[2:3] ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, s1, 0x80000000, v4, s1 ; GFX11-NEXT: s_xor_b32 vcc_lo, s0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v0, v8, v0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v1, v9, v1, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v0, v8, v0 :: v_dual_cndmask_b32 v1, v9, v1 ; GFX11-NEXT: s_xor_b32 vcc_lo, s3, s2 -; GFX11-NEXT: v_cndmask_b32_e32 v2, v10, v4, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v3, v11, v3, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v2, v10, v4 :: v_dual_cndmask_b32 v3, v11, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) ret <2 x i64> %result @@ -4752,38 +4793,69 @@ ; GFX9-NEXT: v_readfirstlane_b32 s3, v1 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_ssubsat_v2i64: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_sub_u32 s8, s0, s4 -; GFX10PLUS-NEXT: s_subb_u32 s9, s1, s5 -; GFX10PLUS-NEXT: v_cmp_gt_i64_e64 s4, s[4:5], 0 -; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1] -; GFX10PLUS-NEXT: s_mov_b32 s10, 0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s9, 31 -; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s8 -; GFX10PLUS-NEXT: v_mov_b32_e32 v1, s9 -; GFX10PLUS-NEXT: s_xor_b32 s8, s4, s1 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 -; GFX10PLUS-NEXT: s_addc_u32 s1, s0, 0x80000000 -; GFX10PLUS-NEXT: s_sub_u32 s4, s2, s6 -; GFX10PLUS-NEXT: s_subb_u32 s5, s3, s7 -; GFX10PLUS-NEXT: v_mov_b32_e32 v2, s4 -; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] -; GFX10PLUS-NEXT: v_cmp_gt_i64_e64 s3, s[6:7], 0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s5, 31 -; GFX10PLUS-NEXT: v_mov_b32_e32 v3, s5 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 -; GFX10PLUS-NEXT: s_xor_b32 s2, s3, s2 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s10, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 -; GFX10PLUS-NEXT: s_addc_u32 s1, s0, 0x80000000 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, s1, s2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_ssubsat_v2i64: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_sub_u32 s8, s0, s4 +; GFX10-NEXT: s_subb_u32 s9, s1, s5 +; GFX10-NEXT: v_cmp_gt_i64_e64 s4, s[4:5], 0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1] +; GFX10-NEXT: s_mov_b32 s10, 0 +; GFX10-NEXT: s_ashr_i32 s0, s9, 31 +; GFX10-NEXT: v_mov_b32_e32 v0, s8 +; GFX10-NEXT: v_mov_b32_e32 v1, s9 +; GFX10-NEXT: s_xor_b32 s8, s4, s1 +; GFX10-NEXT: s_cmp_lg_u32 s10, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 +; GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 +; GFX10-NEXT: s_sub_u32 s4, s2, s6 +; GFX10-NEXT: s_subb_u32 s5, s3, s7 +; GFX10-NEXT: v_mov_b32_e32 v2, s4 +; GFX10-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] +; GFX10-NEXT: v_cmp_gt_i64_e64 s3, s[6:7], 0 +; GFX10-NEXT: s_ashr_i32 s0, s5, 31 +; GFX10-NEXT: v_mov_b32_e32 v3, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 +; GFX10-NEXT: s_xor_b32 s2, s3, s2 +; GFX10-NEXT: s_cmp_lg_u32 s10, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 +; GFX10-NEXT: s_addc_u32 s1, s0, 0x80000000 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s1, s2 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_ssubsat_v2i64: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_sub_u32 s8, s0, s4 +; GFX11-NEXT: s_subb_u32 s9, s1, s5 +; GFX11-NEXT: v_cmp_gt_i64_e64 s4, s[4:5], 0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s1, s[8:9], s[0:1] +; GFX11-NEXT: s_mov_b32 s10, 0 +; GFX11-NEXT: s_ashr_i32 s0, s9, 31 +; GFX11-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX11-NEXT: s_xor_b32 s8, s4, s1 +; GFX11-NEXT: s_cmp_lg_u32 s10, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s0, s8 +; GFX11-NEXT: s_addc_u32 s1, s0, 0x80000000 +; GFX11-NEXT: s_sub_u32 s4, s2, s6 +; GFX11-NEXT: s_subb_u32 s5, s3, s7 +; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s5 +; GFX11-NEXT: v_cmp_lt_i64_e64 s2, s[4:5], s[2:3] +; GFX11-NEXT: v_cmp_gt_i64_e64 s3, s[6:7], 0 +; GFX11-NEXT: s_ashr_i32 s0, s5, 31 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s1, s8 +; GFX11-NEXT: s_xor_b32 s2, s3, s2 +; GFX11-NEXT: s_cmp_lg_u32 s10, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s0, s2 +; GFX11-NEXT: s_addc_u32 s1, s0, 0x80000000 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s1, s2 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 +; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i64> @llvm.ssub.sat.v2i64(<2 x i64> %lhs, <2 x i64> %rhs) ret <2 x i64> %result } @@ -4948,52 +5020,98 @@ ; GFX9-NEXT: v_readfirstlane_b32 s3, v3 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_ssubsat_i128: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_sub_u32 s8, s0, s4 -; GFX10PLUS-NEXT: s_subb_u32 s9, s1, s5 -; GFX10PLUS-NEXT: s_subb_u32 s10, s2, s6 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] -; GFX10PLUS-NEXT: s_subb_u32 s11, s3, s7 -; GFX10PLUS-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] -; GFX10PLUS-NEXT: s_cselect_b32 s12, 1, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s0, s[10:11], s[2:3] -; GFX10PLUS-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10PLUS-NEXT: s_and_b32 s0, 1, s12 -; GFX10PLUS-NEXT: s_cmp_eq_u64 s[6:7], 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 -; GFX10PLUS-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10PLUS-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s11, 31 -; GFX10PLUS-NEXT: s_and_b32 s1, 1, s1 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10PLUS-NEXT: s_mov_b32 s1, 0 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s1, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX10PLUS-NEXT: v_mov_b32_e32 v2, s9 -; GFX10PLUS-NEXT: v_mov_b32_e32 v3, s11 -; GFX10PLUS-NEXT: s_addc_u32 s1, s0, 0 -; GFX10PLUS-NEXT: s_addc_u32 s2, s0, 0 -; GFX10PLUS-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v1, s8 -; GFX10PLUS-NEXT: s_addc_u32 s3, s0, 0x80000000 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s10 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, s1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v1 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_ssubsat_i128: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_sub_u32 s8, s0, s4 +; GFX10-NEXT: s_subb_u32 s9, s1, s5 +; GFX10-NEXT: s_subb_u32 s10, s2, s6 +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] +; GFX10-NEXT: s_subb_u32 s11, s3, s7 +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] +; GFX10-NEXT: s_cselect_b32 s12, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[10:11], s[2:3] +; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX10-NEXT: s_and_b32 s0, 1, s12 +; GFX10-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 +; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: s_ashr_i32 s0, s11, 31 +; GFX10-NEXT: s_and_b32 s1, 1, s1 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX10-NEXT: s_mov_b32 s1, 0 +; GFX10-NEXT: s_cmp_lg_u32 s1, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v2, s9 +; GFX10-NEXT: v_mov_b32_e32 v3, s11 +; GFX10-NEXT: s_addc_u32 s1, s0, 0 +; GFX10-NEXT: s_addc_u32 s2, s0, 0 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s8 +; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s10 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v2 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_ssubsat_i128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_sub_u32 s8, s0, s4 +; GFX11-NEXT: s_subb_u32 s9, s1, s5 +; GFX11-NEXT: s_subb_u32 s10, s2, s6 +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[8:9], s[0:1] +; GFX11-NEXT: s_subb_u32 s11, s3, s7 +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], s[2:3] +; GFX11-NEXT: s_cselect_b32 s12, 1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[10:11], s[2:3] +; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[4:5], 0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11-NEXT: s_and_b32 s0, 1, s12 +; GFX11-NEXT: s_cmp_eq_u64 s[6:7], 0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 +; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[6:7], 0 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: s_cselect_b32 s1, 1, 0 +; GFX11-NEXT: s_ashr_i32 s0, s11, 31 +; GFX11-NEXT: s_and_b32 s1, 1, s1 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX11-NEXT: s_mov_b32 s1, 0 +; GFX11-NEXT: s_cmp_lg_u32 s1, 0 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v3, v2 :: v_dual_mov_b32 v2, s9 +; GFX11-NEXT: v_mov_b32_e32 v3, s11 +; GFX11-NEXT: s_addc_u32 s1, s0, 0 +; GFX11-NEXT: s_addc_u32 s2, s0, 0 +; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v1, s8 +; GFX11-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, s10 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s3, vcc_lo +; GFX11-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 +; GFX11-NEXT: ; return to shader part epilog %result = call i128 @llvm.ssub.sat.i128(i128 %lhs, i128 %rhs) ret i128 %result } @@ -5913,94 +6031,181 @@ ; GFX9-NEXT: v_readfirstlane_b32 s7, v3 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_ssubsat_v2i128: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_sub_u32 s16, s0, s8 -; GFX10PLUS-NEXT: s_subb_u32 s17, s1, s9 -; GFX10PLUS-NEXT: s_subb_u32 s18, s2, s10 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1] -; GFX10PLUS-NEXT: s_subb_u32 s19, s3, s11 -; GFX10PLUS-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] -; GFX10PLUS-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3] -; GFX10PLUS-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10PLUS-NEXT: s_and_b32 s0, 1, s20 -; GFX10PLUS-NEXT: s_cmp_eq_u64 s[10:11], 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 -; GFX10PLUS-NEXT: s_cselect_b32 s1, 1, 0 -; GFX10PLUS-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10PLUS-NEXT: s_mov_b32 s20, 0 -; GFX10PLUS-NEXT: s_and_b32 s1, 1, s1 -; GFX10PLUS-NEXT: s_ashr_i32 s0, s19, 31 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 -; GFX10PLUS-NEXT: s_addc_u32 s1, s0, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 -; GFX10PLUS-NEXT: s_addc_u32 s2, s0, 0 -; GFX10PLUS-NEXT: s_addc_u32 s3, s0, 0x80000000 -; GFX10PLUS-NEXT: s_sub_u32 s8, s4, s12 -; GFX10PLUS-NEXT: s_subb_u32 s9, s5, s13 -; GFX10PLUS-NEXT: s_subb_u32 s10, s6, s14 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s4, s[8:9], s[4:5] -; GFX10PLUS-NEXT: s_subb_u32 s11, s7, s15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo -; GFX10PLUS-NEXT: s_cmp_eq_u64 s[10:11], s[6:7] -; GFX10PLUS-NEXT: v_mov_b32_e32 v2, s17 -; GFX10PLUS-NEXT: v_mov_b32_e32 v7, s11 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 -; GFX10PLUS-NEXT: v_cmp_lt_i64_e64 s4, s[10:11], s[6:7] -; GFX10PLUS-NEXT: v_cmp_gt_u64_e64 s6, s[12:13], 0 -; GFX10PLUS-NEXT: v_xor_b32_e32 v0, v1, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v1, s16 -; GFX10PLUS-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 -; GFX10PLUS-NEXT: s_and_b32 s4, 1, s16 -; GFX10PLUS-NEXT: s_cmp_eq_u64 s[14:15], 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 -; GFX10PLUS-NEXT: v_cmp_gt_i64_e64 s6, s[14:15], 0 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 -; GFX10PLUS-NEXT: s_cselect_b32 s5, 1, 0 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10PLUS-NEXT: s_and_b32 s5, 1, s5 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc_lo -; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v0, s18 -; GFX10PLUS-NEXT: v_mov_b32_e32 v5, s19 -; GFX10PLUS-NEXT: v_mov_b32_e32 v6, s9 -; GFX10PLUS-NEXT: v_xor_b32_e32 v3, v4, v3 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo -; GFX10PLUS-NEXT: s_ashr_i32 s0, s11, 31 -; GFX10PLUS-NEXT: s_cmp_lg_u32 s20, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, v2, s1, vcc_lo -; GFX10PLUS-NEXT: v_and_b32_e32 v3, 1, v3 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, v5, s3, vcc_lo -; GFX10PLUS-NEXT: v_mov_b32_e32 v5, s8 -; GFX10PLUS-NEXT: s_addc_u32 s1, s0, 0 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 -; GFX10PLUS-NEXT: v_mov_b32_e32 v3, s10 -; GFX10PLUS-NEXT: s_addc_u32 s2, s0, 0 -; GFX10PLUS-NEXT: s_addc_u32 s3, s0, 0x80000000 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v6, v6, s1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, v3, s2, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v7, v7, s3, vcc_lo -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v1 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v4 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v5 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s5, v6 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s6, v3 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s7, v7 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_ssubsat_v2i128: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_sub_u32 s16, s0, s8 +; GFX10-NEXT: s_subb_u32 s17, s1, s9 +; GFX10-NEXT: s_subb_u32 s18, s2, s10 +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1] +; GFX10-NEXT: s_subb_u32 s19, s3, s11 +; GFX10-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] +; GFX10-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3] +; GFX10-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX10-NEXT: s_and_b32 s0, 1, s20 +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 +; GFX10-NEXT: s_cselect_b32 s1, 1, 0 +; GFX10-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: s_mov_b32 s20, 0 +; GFX10-NEXT: s_and_b32 s1, 1, s1 +; GFX10-NEXT: s_ashr_i32 s0, s19, 31 +; GFX10-NEXT: s_cmp_lg_u32 s20, 0 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX10-NEXT: s_addc_u32 s1, s0, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX10-NEXT: s_addc_u32 s2, s0, 0 +; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX10-NEXT: s_sub_u32 s8, s4, s12 +; GFX10-NEXT: s_subb_u32 s9, s5, s13 +; GFX10-NEXT: s_subb_u32 s10, s6, s14 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[8:9], s[4:5] +; GFX10-NEXT: s_subb_u32 s11, s7, s15 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo +; GFX10-NEXT: s_cmp_eq_u64 s[10:11], s[6:7] +; GFX10-NEXT: v_mov_b32_e32 v2, s17 +; GFX10-NEXT: v_mov_b32_e32 v7, s11 +; GFX10-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX10-NEXT: v_cmp_lt_i64_e64 s4, s[10:11], s[6:7] +; GFX10-NEXT: v_cmp_gt_u64_e64 s6, s[12:13], 0 +; GFX10-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v1, s16 +; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX10-NEXT: s_and_b32 s4, 1, s16 +; GFX10-NEXT: s_cmp_eq_u64 s[14:15], 0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 +; GFX10-NEXT: v_cmp_gt_i64_e64 s6, s[14:15], 0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX10-NEXT: s_cselect_b32 s5, 1, 0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_and_b32 s5, 1, s5 +; GFX10-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 +; GFX10-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 +; GFX10-NEXT: v_cndmask_b32_e32 v4, v6, v5, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, s18 +; GFX10-NEXT: v_mov_b32_e32 v5, s19 +; GFX10-NEXT: v_mov_b32_e32 v6, s9 +; GFX10-NEXT: v_xor_b32_e32 v3, v4, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo +; GFX10-NEXT: s_ashr_i32 s0, s11, 31 +; GFX10-NEXT: s_cmp_lg_u32 s20, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, v2, s1, vcc_lo +; GFX10-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, v5, s3, vcc_lo +; GFX10-NEXT: v_mov_b32_e32 v5, s8 +; GFX10-NEXT: s_addc_u32 s1, s0, 0 +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX10-NEXT: v_mov_b32_e32 v3, s10 +; GFX10-NEXT: s_addc_u32 s2, s0, 0 +; GFX10-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX10-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v6, v6, s1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, v3, s2, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v7, v7, s3, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s0, v1 +; GFX10-NEXT: v_readfirstlane_b32 s1, v2 +; GFX10-NEXT: v_readfirstlane_b32 s2, v0 +; GFX10-NEXT: v_readfirstlane_b32 s3, v4 +; GFX10-NEXT: v_readfirstlane_b32 s4, v5 +; GFX10-NEXT: v_readfirstlane_b32 s5, v6 +; GFX10-NEXT: v_readfirstlane_b32 s6, v3 +; GFX10-NEXT: v_readfirstlane_b32 s7, v7 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_ssubsat_v2i128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_sub_u32 s16, s0, s8 +; GFX11-NEXT: s_subb_u32 s17, s1, s9 +; GFX11-NEXT: s_subb_u32 s18, s2, s10 +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[16:17], s[0:1] +; GFX11-NEXT: s_subb_u32 s19, s3, s11 +; GFX11-NEXT: s_cmp_eq_u64 s[18:19], s[2:3] +; GFX11-NEXT: s_cselect_b32 s20, 1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cmp_lt_i64_e64 s0, s[18:19], s[2:3] +; GFX11-NEXT: v_cmp_gt_u64_e64 s2, s[8:9], 0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11-NEXT: s_and_b32 s0, 1, s20 +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], 0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s2 +; GFX11-NEXT: v_cmp_gt_i64_e64 s2, s[10:11], 0 +; GFX11-NEXT: s_cselect_b32 s1, 1, 0 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: s_mov_b32 s20, 0 +; GFX11-NEXT: s_and_b32 s1, 1, s1 +; GFX11-NEXT: s_ashr_i32 s0, s19, 31 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s2 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s1 +; GFX11-NEXT: s_addc_u32 s1, s0, 0 +; GFX11-NEXT: s_addc_u32 s2, s0, 0 +; GFX11-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX11-NEXT: s_sub_u32 s8, s4, s12 +; GFX11-NEXT: v_cndmask_b32_e32 v1, v3, v2, vcc_lo +; GFX11-NEXT: s_subb_u32 s9, s5, s13 +; GFX11-NEXT: s_subb_u32 s10, s6, s14 +; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[8:9], s[4:5] +; GFX11-NEXT: s_subb_u32 s11, s7, s15 +; GFX11-NEXT: v_xor_b32_e32 v0, v1, v0 +; GFX11-NEXT: s_cmp_eq_u64 s[10:11], s[6:7] +; GFX11-NEXT: v_mov_b32_e32 v1, s16 +; GFX11-NEXT: s_cselect_b32 s16, 1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v3, 0, 1, s4 +; GFX11-NEXT: v_cmp_lt_i64_e64 s4, s[10:11], s[6:7] +; GFX11-NEXT: v_cmp_gt_u64_e64 s6, s[12:13], 0 +; GFX11-NEXT: v_dual_mov_b32 v7, s11 :: v_dual_and_b32 v0, 1, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v4, 0, 1, s4 +; GFX11-NEXT: s_and_b32 s4, 1, s16 +; GFX11-NEXT: s_cmp_eq_u64 s[14:15], 0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, 0, 1, s6 +; GFX11-NEXT: v_cmp_gt_i64_e64 s6, s[14:15], 0 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s4 +; GFX11-NEXT: s_cselect_b32 s5, 1, 0 +; GFX11-NEXT: v_mov_b32_e32 v2, s17 +; GFX11-NEXT: s_and_b32 s5, 1, s5 +; GFX11-NEXT: v_cndmask_b32_e32 v3, v4, v3, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v6, 0, 1, s6 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s5 +; GFX11-NEXT: v_dual_cndmask_b32 v4, v6, v5 :: v_dual_mov_b32 v5, s19 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_mov_b32_e32 v6, s9 +; GFX11-NEXT: v_xor_b32_e32 v3, v4, v3 +; GFX11-NEXT: v_mov_b32_e32 v0, s18 +; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, s0, vcc_lo +; GFX11-NEXT: s_ashr_i32 s0, s11, 31 +; GFX11-NEXT: s_cmp_lg_u32 s20, 0 +; GFX11-NEXT: v_and_b32_e32 v3, 1, v3 +; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, s1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, s2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, v5, s3, vcc_lo +; GFX11-NEXT: v_mov_b32_e32 v5, s8 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v3 +; GFX11-NEXT: v_mov_b32_e32 v3, s10 +; GFX11-NEXT: s_addc_u32 s1, s0, 0 +; GFX11-NEXT: s_addc_u32 s2, s0, 0 +; GFX11-NEXT: s_addc_u32 s3, s0, 0x80000000 +; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, s0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, s1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, s2, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, s3, vcc_lo +; GFX11-NEXT: v_readfirstlane_b32 s0, v1 +; GFX11-NEXT: v_readfirstlane_b32 s1, v2 +; GFX11-NEXT: v_readfirstlane_b32 s2, v0 +; GFX11-NEXT: v_readfirstlane_b32 s3, v4 +; GFX11-NEXT: v_readfirstlane_b32 s4, v5 +; GFX11-NEXT: v_readfirstlane_b32 s5, v6 +; GFX11-NEXT: v_readfirstlane_b32 s6, v3 +; GFX11-NEXT: v_readfirstlane_b32 s7, v7 +; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i128> @llvm.ssub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) ret <2 x i128> %result } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/uaddsat.ll @@ -3549,16 +3549,16 @@ ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[2:3], v[10:11] ; GFX11-NEXT: v_cndmask_b32_e32 v8, v9, v8, vcc_lo ; GFX11-NEXT: v_cmp_eq_u64_e32 vcc_lo, v[6:7], v[14:15] -; GFX11-NEXT: v_and_b32_e32 v8, 1, v8 ; GFX11-NEXT: v_cndmask_b32_e32 v9, v13, v12, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 ; GFX11-NEXT: v_and_b32_e32 v9, 1, v9 +; GFX11-NEXT: v_and_b32_e32 v8, 1, v8 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v9 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v8 +; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, -1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v0, v0, -1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v1, v1, -1, vcc_lo -; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v9 ; GFX11-NEXT: v_cndmask_b32_e64 v2, v2, -1, vcc_lo ; GFX11-NEXT: v_cndmask_b32_e64 v3, v3, -1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v4, v4, -1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v5, v5, -1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v6, v6, -1, s0 ; GFX11-NEXT: v_cndmask_b32_e64 v7, v7, -1, s0 @@ -3760,55 +3760,104 @@ ; GFX9-NEXT: v_readfirstlane_b32 s7, v3 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_uaddsat_v2i128: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_add_u32 s0, s0, s8 -; GFX10PLUS-NEXT: s_addc_u32 s1, s1, s9 -; GFX10PLUS-NEXT: s_addc_u32 s2, s2, s10 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[8:9] -; GFX10PLUS-NEXT: s_addc_u32 s3, s3, s11 -; GFX10PLUS-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] -; GFX10PLUS-NEXT: s_cselect_b32 s16, 1, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, 0, 1, s8 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s8, s[2:3], s[10:11] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 0, 1, s8 -; GFX10PLUS-NEXT: s_and_b32 s8, 1, s16 -; GFX10PLUS-NEXT: s_add_u32 s4, s4, s12 -; GFX10PLUS-NEXT: s_addc_u32 s5, s5, s13 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s9, s[4:5], s[12:13] -; GFX10PLUS-NEXT: s_addc_u32 s6, s6, s14 -; GFX10PLUS-NEXT: s_addc_u32 s7, s7, s15 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10PLUS-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 0, 1, s9 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s9, s[6:7], s[14:15] -; GFX10PLUS-NEXT: s_cselect_b32 s8, 1, 0 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10PLUS-NEXT: s_and_b32 s8, 1, s8 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, 0, 1, s9 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10PLUS-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s0, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, s2, -1, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, s3, -1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s1, -1, vcc_lo -; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, s4, -1, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v5, s5, -1, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v6, s6, -1, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v7, s7, -1, s0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s5, v5 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s6, v6 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s7, v7 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_uaddsat_v2i128: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_add_u32 s0, s0, s8 +; GFX10-NEXT: s_addc_u32 s1, s1, s9 +; GFX10-NEXT: s_addc_u32 s2, s2, s10 +; GFX10-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[8:9] +; GFX10-NEXT: s_addc_u32 s3, s3, s11 +; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] +; GFX10-NEXT: s_cselect_b32 s16, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s8 +; GFX10-NEXT: v_cmp_lt_u64_e64 s8, s[2:3], s[10:11] +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s8 +; GFX10-NEXT: s_and_b32 s8, 1, s16 +; GFX10-NEXT: s_add_u32 s4, s4, s12 +; GFX10-NEXT: s_addc_u32 s5, s5, s13 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 +; GFX10-NEXT: v_cmp_lt_u64_e64 s9, s[4:5], s[12:13] +; GFX10-NEXT: s_addc_u32 s6, s6, s14 +; GFX10-NEXT: s_addc_u32 s7, s7, s15 +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s9 +; GFX10-NEXT: v_cmp_lt_u64_e64 s9, s[6:7], s[14:15] +; GFX10-NEXT: s_cselect_b32 s8, 1, 0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: s_and_b32 s8, 1, s8 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s9 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, s0, -1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, s2, -1, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, s3, -1, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, s1, -1, vcc_lo +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_cndmask_b32_e64 v4, s4, -1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, s5, -1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, s6, -1, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, s7, -1, s0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-NEXT: v_readfirstlane_b32 s6, v6 +; GFX10-NEXT: v_readfirstlane_b32 s7, v7 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_uaddsat_v2i128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_add_u32 s0, s0, s8 +; GFX11-NEXT: s_addc_u32 s1, s1, s9 +; GFX11-NEXT: s_addc_u32 s2, s2, s10 +; GFX11-NEXT: v_cmp_lt_u64_e64 s8, s[0:1], s[8:9] +; GFX11-NEXT: s_addc_u32 s3, s3, s11 +; GFX11-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] +; GFX11-NEXT: s_cselect_b32 s16, 1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s8 +; GFX11-NEXT: v_cmp_lt_u64_e64 s8, s[2:3], s[10:11] +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s8 +; GFX11-NEXT: s_and_b32 s8, 1, s16 +; GFX11-NEXT: s_add_u32 s4, s4, s12 +; GFX11-NEXT: s_addc_u32 s5, s5, s13 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 +; GFX11-NEXT: v_cmp_lt_u64_e64 s9, s[4:5], s[12:13] +; GFX11-NEXT: s_addc_u32 s6, s6, s14 +; GFX11-NEXT: s_addc_u32 s7, s7, s15 +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s9 +; GFX11-NEXT: v_cmp_lt_u64_e64 s9, s[6:7], s[14:15] +; GFX11-NEXT: s_cselect_b32 s8, 1, 0 +; GFX11-NEXT: s_and_b32 s8, 1, s8 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s8 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s9 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v2, v1 :: v_dual_and_b32 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, s0, -1, vcc_lo +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v1, s1, -1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, s2, -1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, s3, -1, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, s4, -1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, s5, -1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v6, s6, -1, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v7, s7, -1, s0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 +; GFX11-NEXT: v_readfirstlane_b32 s4, v4 +; GFX11-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-NEXT: v_readfirstlane_b32 s6, v6 +; GFX11-NEXT: v_readfirstlane_b32 s7, v7 +; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i128> @llvm.uadd.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) ret <2 x i128> %result } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/usubsat.ll @@ -3630,55 +3630,104 @@ ; GFX9-NEXT: v_readfirstlane_b32 s7, v3 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: s_usubsat_v2i128: -; GFX10PLUS: ; %bb.0: -; GFX10PLUS-NEXT: s_sub_u32 s16, s0, s8 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9] -; GFX10PLUS-NEXT: s_subb_u32 s17, s1, s9 -; GFX10PLUS-NEXT: s_subb_u32 s18, s2, s10 -; GFX10PLUS-NEXT: s_subb_u32 s19, s3, s11 -; GFX10PLUS-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], s[10:11] -; GFX10PLUS-NEXT: s_cselect_b32 s20, 1, 0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 -; GFX10PLUS-NEXT: s_and_b32 s0, 1, s20 -; GFX10PLUS-NEXT: s_sub_u32 s2, s4, s12 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s4, s[4:5], s[12:13] -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10PLUS-NEXT: s_subb_u32 s1, s5, s13 -; GFX10PLUS-NEXT: s_subb_u32 s8, s6, s14 -; GFX10PLUS-NEXT: s_subb_u32 s3, s7, s15 -; GFX10PLUS-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 -; GFX10PLUS-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[14:15] -; GFX10PLUS-NEXT: s_cselect_b32 s0, 1, 0 -; GFX10PLUS-NEXT: s_and_b32 s0, 1, s0 -; GFX10PLUS-NEXT: v_and_b32_e32 v0, 1, v0 -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 -; GFX10PLUS-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo -; GFX10PLUS-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 -; GFX10PLUS-NEXT: v_and_b32_e32 v1, 1, v1 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v0, s16, 0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v2, s18, 0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v3, s19, 0, vcc_lo -; GFX10PLUS-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v1, s17, 0, vcc_lo -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v4, s2, 0, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v5, s1, 0, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v6, s8, 0, s0 -; GFX10PLUS-NEXT: v_cndmask_b32_e64 v7, s3, 0, s0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s0, v0 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s1, v1 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s2, v2 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s3, v3 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s4, v4 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s5, v5 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s6, v6 -; GFX10PLUS-NEXT: v_readfirstlane_b32 s7, v7 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: s_usubsat_v2i128: +; GFX10: ; %bb.0: +; GFX10-NEXT: s_sub_u32 s16, s0, s8 +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9] +; GFX10-NEXT: s_subb_u32 s17, s1, s9 +; GFX10-NEXT: s_subb_u32 s18, s2, s10 +; GFX10-NEXT: s_subb_u32 s19, s3, s11 +; GFX10-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] +; GFX10-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX10-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], s[10:11] +; GFX10-NEXT: s_cselect_b32 s20, 1, 0 +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX10-NEXT: s_and_b32 s0, 1, s20 +; GFX10-NEXT: s_sub_u32 s2, s4, s12 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[4:5], s[12:13] +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: s_subb_u32 s1, s5, s13 +; GFX10-NEXT: s_subb_u32 s8, s6, s14 +; GFX10-NEXT: s_subb_u32 s3, s7, s15 +; GFX10-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] +; GFX10-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 +; GFX10-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[14:15] +; GFX10-NEXT: s_cselect_b32 s0, 1, 0 +; GFX10-NEXT: s_and_b32 s0, 1, s0 +; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 +; GFX10-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX10-NEXT: v_cndmask_b32_e32 v1, v2, v1, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX10-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v0, s16, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v2, s18, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v3, s19, 0, vcc_lo +; GFX10-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX10-NEXT: v_cndmask_b32_e64 v1, s17, 0, vcc_lo +; GFX10-NEXT: v_cndmask_b32_e64 v4, s2, 0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v5, s1, 0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v6, s8, 0, s0 +; GFX10-NEXT: v_cndmask_b32_e64 v7, s3, 0, s0 +; GFX10-NEXT: v_readfirstlane_b32 s0, v0 +; GFX10-NEXT: v_readfirstlane_b32 s1, v1 +; GFX10-NEXT: v_readfirstlane_b32 s2, v2 +; GFX10-NEXT: v_readfirstlane_b32 s3, v3 +; GFX10-NEXT: v_readfirstlane_b32 s4, v4 +; GFX10-NEXT: v_readfirstlane_b32 s5, v5 +; GFX10-NEXT: v_readfirstlane_b32 s6, v6 +; GFX10-NEXT: v_readfirstlane_b32 s7, v7 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: s_usubsat_v2i128: +; GFX11: ; %bb.0: +; GFX11-NEXT: s_sub_u32 s16, s0, s8 +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[0:1], s[8:9] +; GFX11-NEXT: s_subb_u32 s17, s1, s9 +; GFX11-NEXT: s_subb_u32 s18, s2, s10 +; GFX11-NEXT: s_subb_u32 s19, s3, s11 +; GFX11-NEXT: s_cmp_eq_u64 s[2:3], s[10:11] +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s0 +; GFX11-NEXT: v_cmp_lt_u64_e64 s0, s[2:3], s[10:11] +; GFX11-NEXT: s_cselect_b32 s20, 1, 0 +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s0 +; GFX11-NEXT: s_and_b32 s0, 1, s20 +; GFX11-NEXT: s_sub_u32 s2, s4, s12 +; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[4:5], s[12:13] +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: s_subb_u32 s1, s5, s13 +; GFX11-NEXT: s_subb_u32 s8, s6, s14 +; GFX11-NEXT: s_subb_u32 s3, s7, s15 +; GFX11-NEXT: s_cmp_eq_u64 s[6:7], s[14:15] +; GFX11-NEXT: v_cndmask_b32_e32 v0, v1, v0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, 0, 1, s4 +; GFX11-NEXT: v_cmp_lt_u64_e64 s4, s[6:7], s[14:15] +; GFX11-NEXT: s_cselect_b32 s0, 1, 0 +; GFX11-NEXT: s_and_b32 s0, 1, s0 +; GFX11-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, s4 +; GFX11-NEXT: v_dual_cndmask_b32 v1, v2, v1 :: v_dual_and_b32 v0, 1, v0 +; GFX11-NEXT: v_and_b32_e32 v1, 1, v1 +; GFX11-NEXT: v_cmp_ne_u32_e32 vcc_lo, 0, v0 +; GFX11-NEXT: v_cmp_ne_u32_e64 s0, 0, v1 +; GFX11-NEXT: v_cndmask_b32_e64 v0, s16, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v1, s17, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v2, s18, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v3, s19, 0, vcc_lo +; GFX11-NEXT: v_cndmask_b32_e64 v4, s2, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v5, s1, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v6, s8, 0, s0 +; GFX11-NEXT: v_cndmask_b32_e64 v7, s3, 0, s0 +; GFX11-NEXT: v_readfirstlane_b32 s0, v0 +; GFX11-NEXT: v_readfirstlane_b32 s1, v1 +; GFX11-NEXT: v_readfirstlane_b32 s2, v2 +; GFX11-NEXT: v_readfirstlane_b32 s3, v3 +; GFX11-NEXT: v_readfirstlane_b32 s4, v4 +; GFX11-NEXT: v_readfirstlane_b32 s5, v5 +; GFX11-NEXT: v_readfirstlane_b32 s6, v6 +; GFX11-NEXT: v_readfirstlane_b32 s7, v7 +; GFX11-NEXT: ; return to shader part epilog %result = call <2 x i128> @llvm.usub.sat.v2i128(<2 x i128> %lhs, <2 x i128> %rhs) ret <2 x i128> %result } diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -212,10 +212,9 @@ ; GFX1132-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_mul_i32 s3, s3, 5 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, s3 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 @@ -454,11 +453,10 @@ ; GFX1132-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s2, s0, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 @@ -1569,8 +1567,8 @@ ; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 ; GFX1132-NEXT: s_mul_i32 s5, s2, s5 ; GFX1132-NEXT: s_add_i32 s7, s7, s6 -; GFX1132-NEXT: v_mov_b32_e32 v0, s5 -; GFX1132-NEXT: v_mov_b32_e32 v1, s7 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] @@ -1878,10 +1876,9 @@ ; GFX1132-NEXT: s_cbranch_execz .LBB7_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_mul_i32 s3, s3, 5 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, s3 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 @@ -2124,11 +2121,10 @@ ; GFX1132-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s2, s0, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 @@ -3265,8 +3261,8 @@ ; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 ; GFX1132-NEXT: s_mul_i32 s5, s2, s5 ; GFX1132-NEXT: s_add_i32 s7, s7, s6 -; GFX1132-NEXT: v_mov_b32_e32 v0, s5 -; GFX1132-NEXT: v_mov_b32_e32 v1, s7 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] @@ -5037,8 +5033,7 @@ ; GFX1132-NEXT: s_cbranch_execz .LBB18_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] @@ -5648,8 +5643,7 @@ ; GFX1132-NEXT: s_cbranch_execz .LBB20_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] @@ -6256,8 +6250,7 @@ ; GFX1132-NEXT: s_cbranch_execz .LBB22_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] @@ -6864,8 +6857,7 @@ ; GFX1132-NEXT: s_cbranch_execz .LBB24_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/build_vector.ll b/llvm/test/CodeGen/AMDGPU/build_vector.ll --- a/llvm/test/CodeGen/AMDGPU/build_vector.ll +++ b/llvm/test/CodeGen/AMDGPU/build_vector.ll @@ -2,7 +2,7 @@ ; RUN: llc < %s -march=amdgcn -verify-machineinstrs | FileCheck %s --check-prefixes=GFX6,GFX678,ALL ; RUN: llc < %s -march=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs | FileCheck %s --check-prefixes=GFX8,GFX678,ALL ; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1030 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX10,GFX1011,ALL -; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX1011,ALL +; RUN: llc < %s -mtriple=amdgcn-amd-amdpal -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs | FileCheck %s --check-prefixes=GFX11,GFX1011,ALL ; ALL-LABEL: {{^}}build_vector2: ; R600: MOV diff --git a/llvm/test/CodeGen/AMDGPU/clamp.ll b/llvm/test/CodeGen/AMDGPU/clamp.ll --- a/llvm/test/CodeGen/AMDGPU/clamp.ll +++ b/llvm/test/CodeGen/AMDGPU/clamp.ll @@ -1358,8 +1358,7 @@ ; GFX11-LABEL: v_clamp_constants_to_one_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v1, 1.0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1407,8 +1406,7 @@ ; GFX11-LABEL: v_clamp_constants_to_zero_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1457,8 +1455,7 @@ ; GFX11-LABEL: v_clamp_constant_preserve_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0.5 +; GFX11-NEXT: v_dual_mov_b32 v1, 0.5 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1507,8 +1504,7 @@ ; GFX11-LABEL: v_clamp_constant_preserve_denorm_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0x7fffff +; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fffff :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1556,8 +1552,7 @@ ; GFX11-LABEL: v_clamp_constant_qnan_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1605,8 +1600,7 @@ ; GFX11-LABEL: v_clamp_constant_snan_f32: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2335,8 +2329,7 @@ ; GFX11-LABEL: v_clamp_constant_qnan_f32_no_dx10_clamp: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0x7fc00000 +; GFX11-NEXT: v_dual_mov_b32 v1, 0x7fc00000 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2385,8 +2378,7 @@ ; GFX11-LABEL: v_clamp_constant_snan_f32_no_dx10_clamp: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 0x7f800001 +; GFX11-NEXT: v_dual_mov_b32 v1, 0x7f800001 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll --- a/llvm/test/CodeGen/AMDGPU/cluster_stores.ll +++ b/llvm/test/CodeGen/AMDGPU/cluster_stores.ll @@ -106,15 +106,13 @@ ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: flat_load_b32 v2, v[0:1] ; GFX11-NEXT: flat_load_b32 v3, v[0:1] offset:8 ; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:16 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] offset:24 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; GFX11-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-NEXT: s_waitcnt vmcnt(2) lgkmcnt(3) @@ -246,15 +244,13 @@ ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x24 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: flat_load_b32 v2, v[0:1] offset:8 ; GFX11-NEXT: flat_load_b32 v3, v[0:1] ; GFX11-NEXT: flat_load_b32 v4, v[0:1] offset:16 ; GFX11-NEXT: flat_load_b32 v5, v[0:1] offset:24 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-NEXT: s_waitcnt vmcnt(3) lgkmcnt(3) ; GFX11-NEXT: v_add_nc_u32_e32 v2, 1, v2 ; GFX11-NEXT: s_waitcnt vmcnt(2) lgkmcnt(2) @@ -341,10 +337,8 @@ ; GFX11-NEXT: image_load v[2:5], v[2:3], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; GFX11-NEXT: image_load v[6:9], v[6:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX11-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-NEXT: v_dual_add_f32 v2, v2, v6 :: v_dual_add_f32 v5, v5, v9 +; GFX11-NEXT: v_dual_add_f32 v4, v4, v8 :: v_dual_add_f32 v3, v3, v7 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -398,10 +392,8 @@ ; GFX11-NEXT: image_load_mip v[2:5], [v0, v1, v6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; GFX11-NEXT: image_load_mip v[6:9], [v0, v1, v6], s[8:15] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX11-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-NEXT: v_dual_add_f32 v5, v5, v9 :: v_dual_add_f32 v4, v4, v8 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v7 :: v_dual_add_f32 v2, v2, v6 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[16:23] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -476,30 +468,23 @@ ; ; GFX11-LABEL: cluster_image_sample: ; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: v_cvt_f32_i32_e32 v8, v0 ; GFX11-NEXT: v_cvt_f32_i32_e32 v9, v1 -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: v_mov_b32_e32 v10, 1.0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_add_f32_e32 v2, 1.0, v8 -; GFX11-NEXT: v_add_f32_e32 v3, 1.0, v9 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v5, v4 -; GFX11-NEXT: v_mov_b32_e32 v6, v4 -; GFX11-NEXT: v_mov_b32_e32 v7, v4 -; GFX11-NEXT: v_add_f32_e32 v8, 2.0, v8 -; GFX11-NEXT: v_add_f32_e32 v9, 2.0, v9 -; GFX11-NEXT: v_mov_b32_e32 v11, v10 +; GFX11-NEXT: v_cvt_f32_i32_e32 v8, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_add_f32 v3, 1.0, v9 +; GFX11-NEXT: v_dual_mov_b32 v10, 1.0 :: v_dual_mov_b32 v7, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_add_f32 v2, 1.0, v8 :: v_dual_mov_b32 v5, v4 +; GFX11-NEXT: v_dual_mov_b32 v6, v4 :: v_dual_add_f32 v9, 2.0, v9 +; GFX11-NEXT: v_dual_add_f32 v8, 2.0, v8 :: v_dual_mov_b32 v11, v10 ; GFX11-NEXT: v_mov_b32_e32 v12, v10 ; GFX11-NEXT: v_mov_b32_e32 v13, v10 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: image_sample_d v[2:5], v[2:7], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: image_sample_d v[6:9], v[8:13], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_add_f32_e32 v5, v5, v9 -; GFX11-NEXT: v_add_f32_e32 v4, v4, v8 -; GFX11-NEXT: v_add_f32_e32 v3, v3, v7 -; GFX11-NEXT: v_add_f32_e32 v2, v2, v6 +; GFX11-NEXT: v_dual_add_f32 v4, v4, v8 :: v_dual_add_f32 v5, v5, v9 +; GFX11-NEXT: v_dual_add_f32 v3, v3, v7 :: v_dual_add_f32 v2, v2, v6 ; GFX11-NEXT: image_store v[2:5], v[0:1], s[12:19] dmask:0xf dim:SQ_RSRC_IMG_2D unorm ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll b/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll --- a/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll +++ b/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll @@ -8,17 +8,16 @@ ; GCN-NEXT: s_mov_b32 s1, exec_lo ; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo ; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mbcnt_lo_u32_b32 v7, -1, 0 ; GCN-NEXT: lds_param_load v3, attr1.x wait_vdst:15 ; GCN-NEXT: lds_param_load v4, attr1.y wait_vdst:15 ; GCN-NEXT: lds_param_load v5, attr1.z wait_vdst:15 ; GCN-NEXT: lds_param_load v6, attr1.w wait_vdst:15 -; GCN-NEXT: v_mbcnt_lo_u32_b32 v7, -1, 0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GCN-NEXT: v_mbcnt_hi_u32_b32 v7, -1, v7 -; GCN-NEXT: v_and_b32_e32 v7, 1, v7 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_and_b32 v7, 1, v7 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GCN-NEXT: v_interp_p10_f32 v8, v4, v2, v4 wait_exp:2 ; GCN-NEXT: v_interp_p10_f32 v10, v5, v2, v5 wait_exp:1 ; GCN-NEXT: v_interp_p10_f32 v9, v6, v2, v6 @@ -32,9 +31,8 @@ ; GCN-NEXT: v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6] ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mov_b32_dpp v6, v6 dpp8:[1,0,3,2,5,4,7,6] -; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo -; GCN-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GCN-NEXT: v_dual_cndmask_b32 v3, v4, v5 :: v_dual_cndmask_b32 v4, v5, v4 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GCN-NEXT: v_cndmask_b32_e32 v5, v2, v6, vcc_lo ; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo ; GCN-NEXT: v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6] diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -47,8 +47,7 @@ ; GFX11-SDAG-LABEL: soff1_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 @@ -64,13 +63,11 @@ ; GFX11-GISEL-LABEL: soff1_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 :: v_dual_add_nc_u32 v0, v1, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:2 dlc @@ -135,11 +132,10 @@ ; GFX11-SDAG-LABEL: soff1_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 @@ -154,13 +150,11 @@ ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 :: v_dual_add_nc_u32 v0, v1, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:2 dlc @@ -225,10 +219,8 @@ ; GFX11-SDAG-LABEL: soff1_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc @@ -244,13 +236,11 @@ ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 :: v_dual_add_nc_u32 v0, v1, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:2 dlc @@ -315,8 +305,7 @@ ; GFX11-SDAG-LABEL: soff2_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 @@ -334,14 +323,12 @@ ; GFX11-GISEL-LABEL: soff2_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 :: v_dual_add_nc_u32 v0, v1, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:2 dlc @@ -408,13 +395,11 @@ ; GFX11-SDAG-LABEL: soff2_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 @@ -429,14 +414,12 @@ ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 :: v_dual_add_nc_u32 v0, v1, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:2 dlc @@ -503,10 +486,8 @@ ; GFX11-SDAG-LABEL: soff2_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -524,14 +505,12 @@ ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 :: v_dual_add_nc_u32 v0, v1, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:2 dlc @@ -596,9 +575,8 @@ ; GFX11-SDAG-LABEL: soff4_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v4, 4 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -616,14 +594,12 @@ ; GFX11-GISEL-LABEL: soff4_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 :: v_dual_add_nc_u32 v0, v1, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:2 dlc @@ -690,8 +666,7 @@ ; GFX11-SDAG-LABEL: soff4_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -712,14 +687,12 @@ ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 :: v_dual_add_nc_u32 v0, v1, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:2 dlc @@ -786,10 +759,8 @@ ; GFX11-SDAG-LABEL: soff4_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -807,14 +778,12 @@ ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 :: v_dual_add_nc_u32 v0, v1, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:2 dlc diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -58,10 +58,8 @@ ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:64 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:48 @@ -175,10 +173,8 @@ ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: s_clause 0x3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:64 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:48 @@ -239,10 +235,8 @@ ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:48 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32 @@ -313,10 +307,8 @@ ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: s_clause 0x3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:48 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32 @@ -685,9 +677,8 @@ ; ; GFX11-LABEL: store_load_vindex_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 15 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 4, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -748,9 +739,8 @@ ; ; GFX11-PAL-LABEL: store_load_vindex_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 4, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -815,10 +805,9 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 15 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-NEXT: scratch_store_b32 v0, v2, s32 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -871,10 +860,9 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s32 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1035,10 +1023,8 @@ ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:288 @@ -1164,10 +1150,8 @@ ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: s_clause 0x3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:288 @@ -1237,10 +1221,8 @@ ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272 @@ -1319,10 +1301,8 @@ ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: s_clause 0x3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272 @@ -1764,8 +1744,7 @@ ; ; GFX11-LABEL: store_load_vindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 @@ -1859,8 +1838,7 @@ ; ; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 @@ -1927,9 +1905,8 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 @@ -1993,9 +1970,8 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 @@ -2095,10 +2071,8 @@ ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo ; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4010 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 @@ -2234,10 +2208,8 @@ ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo ; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 @@ -2317,10 +2289,8 @@ ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo ; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4010 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 @@ -2441,10 +2411,8 @@ ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo ; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 @@ -2871,8 +2839,7 @@ ; ; GFX11-LABEL: store_load_vindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2968,8 +2935,7 @@ ; ; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) @@ -3037,9 +3003,8 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -3107,9 +3072,8 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) @@ -3190,8 +3154,7 @@ ; ; GFX11-LABEL: store_load_large_imm_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_mov_b32_e32 v0, 13 -; GFX11-NEXT: v_mov_b32_e32 v1, 0x3000 +; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 ; GFX11-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3285,8 +3248,7 @@ ; ; GFX11-PAL-LABEL: store_load_large_imm_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 13 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x3000 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 ; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, off offset:4 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3344,8 +3306,7 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 13 -; GFX11-NEXT: v_mov_b32_e32 v1, 0x3000 +; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 ; GFX11-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3406,8 +3367,7 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 13 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x3000 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 ; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3601,8 +3561,7 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v1, 15 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc @@ -3647,8 +3606,7 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 ; GFX11-PAL-NEXT: scratch_store_b64 v0, v[1:2], off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc @@ -3698,8 +3656,7 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v1, 15 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc @@ -3744,8 +3701,7 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 ; GFX11-PAL-NEXT: scratch_store_b64 v0, v[1:2], off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc @@ -3797,8 +3753,7 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-NEXT: v_mov_b32_e32 v2, 2 +; GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-NEXT: v_mov_b32_e32 v3, 3 ; GFX11-NEXT: scratch_store_b96 v0, v[1:3], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3847,8 +3802,7 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 2 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-PAL-NEXT: v_mov_b32_e32 v3, 3 ; GFX11-PAL-NEXT: scratch_store_b96 v0, v[1:3], off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3904,10 +3858,8 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-NEXT: v_mov_b32_e32 v2, 2 -; GFX11-NEXT: v_mov_b32_e32 v3, 3 -; GFX11-NEXT: v_mov_b32_e32 v4, 4 +; GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX11-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc @@ -3958,10 +3910,8 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 2 -; GFX11-PAL-NEXT: v_mov_b32_e32 v3, 3 -; GFX11-PAL-NEXT: v_mov_b32_e32 v4, 4 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX11-PAL-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4 ; GFX11-PAL-NEXT: scratch_store_b128 v0, v[1:4], off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc @@ -4109,8 +4059,7 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0 ; GFX11-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc @@ -4167,8 +4116,7 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0 ; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc @@ -4243,8 +4191,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 16 -; GFX11-NEXT: v_mov_b32_e32 v1, 0x810 +; GFX11-NEXT: v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v0 ; GFX11-NEXT: ;;#ASMEND @@ -4344,8 +4291,7 @@ ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 16 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x810 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810 ; GFX11-PAL-NEXT: ;;#ASMSTART ; GFX11-PAL-NEXT: ; use v0 ; GFX11-PAL-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll --- a/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx11-user-sgpr-init16-bug.ll @@ -1,7 +1,7 @@ -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 < %s | FileCheck -check-prefix=GCN %s -; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel=0 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GCN %s +; RUN: llc -global-isel=1 -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1102 -amdgpu-enable-vopd=0 < %s | FileCheck -check-prefix=GCN %s ; There aren't any stack objects, but we still enable the ; private_segment_wavefront_offset to get to 16, and the workgroup ID diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -378,6 +378,7 @@ ; GCN-O1-NEXT: Branch Probability Basic Block Placement ; GCN-O1-NEXT: Insert fentry calls ; GCN-O1-NEXT: Insert XRay ops +; GCN-O1-NEXT: GCN Create VOPD Instructions ; GCN-O1-NEXT: SI Memory Legalizer ; GCN-O1-NEXT: MachineDominator Tree Construction ; GCN-O1-NEXT: Machine Natural Loop Construction @@ -667,6 +668,7 @@ ; GCN-O1-OPTS-NEXT: Branch Probability Basic Block Placement ; GCN-O1-OPTS-NEXT: Insert fentry calls ; GCN-O1-OPTS-NEXT: Insert XRay ops +; GCN-O1-OPTS-NEXT: GCN Create VOPD Instructions ; GCN-O1-OPTS-NEXT: SI Memory Legalizer ; GCN-O1-OPTS-NEXT: MachineDominator Tree Construction ; GCN-O1-OPTS-NEXT: Machine Natural Loop Construction @@ -958,6 +960,7 @@ ; GCN-O2-NEXT: Branch Probability Basic Block Placement ; GCN-O2-NEXT: Insert fentry calls ; GCN-O2-NEXT: Insert XRay ops +; GCN-O2-NEXT: GCN Create VOPD Instructions ; GCN-O2-NEXT: SI Memory Legalizer ; GCN-O2-NEXT: MachineDominator Tree Construction ; GCN-O2-NEXT: Machine Natural Loop Construction @@ -1262,6 +1265,7 @@ ; GCN-O3-NEXT: Branch Probability Basic Block Placement ; GCN-O3-NEXT: Insert fentry calls ; GCN-O3-NEXT: Insert XRay ops +; GCN-O3-NEXT: GCN Create VOPD Instructions ; GCN-O3-NEXT: SI Memory Legalizer ; GCN-O3-NEXT: MachineDominator Tree Construction ; GCN-O3-NEXT: Machine Natural Loop Construction diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.ds.ordered.add.gfx11.ll @@ -1,5 +1,5 @@ -; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GCN,FUNC %s ; FUNC-LABEL: {{^}}ds_ordered_add: ; GCN-DAG: v_mov_b32_e32 v[[INCR:[0-9]+]], 31 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.exp.ll @@ -1,6 +1,6 @@ ; RUN: llc -march=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,PREGFX11 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX10,PREGFX11 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -strict-whitespace -check-prefixes=GCN,GFX11 %s declare void @llvm.amdgcn.exp.f32(i32, i32, float, float, float, float, i1, i1) #1 declare void @llvm.amdgcn.exp.i32(i32, i32, i32, i32, i32, i32, i1, i1) #1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll @@ -8,13 +8,12 @@ ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 clamp ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -38,13 +37,12 @@ ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -124,17 +124,14 @@ ; ; GFX11-LABEL: load_1d_tfe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v7, v6 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0 ; GFX11-NEXT: v_mov_b32_e32 v9, v6 ; GFX11-NEXT: v_mov_b32_e32 v10, v6 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v0, v6 -; GFX11-NEXT: v_mov_b32_e32 v1, v7 -; GFX11-NEXT: v_mov_b32_e32 v2, v8 -; GFX11-NEXT: v_mov_b32_e32 v3, v9 -; GFX11-NEXT: v_mov_b32_e32 v4, v10 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10 ; GFX11-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v6, v4, s[8:9] @@ -231,17 +228,14 @@ ; ; GFX11-LABEL: load_1d_lwe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v7, v6 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0 ; GFX11-NEXT: v_mov_b32_e32 v9, v6 ; GFX11-NEXT: v_mov_b32_e32 v10, v6 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v0, v6 -; GFX11-NEXT: v_mov_b32_e32 v1, v7 -; GFX11-NEXT: v_mov_b32_e32 v2, v8 -; GFX11-NEXT: v_mov_b32_e32 v3, v9 -; GFX11-NEXT: v_mov_b32_e32 v4, v10 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10 ; GFX11-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v6, v4, s[8:9] @@ -377,18 +371,15 @@ ; ; GFX11-LABEL: load_2d_tfe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v7, 0 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v9, v7 -; GFX11-NEXT: v_mov_b32_e32 v10, v7 ; GFX11-NEXT: v_mov_b32_e32 v11, v7 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v0, v7 -; GFX11-NEXT: v_mov_b32_e32 v1, v8 -; GFX11-NEXT: v_mov_b32_e32 v2, v9 -; GFX11-NEXT: v_mov_b32_e32 v3, v10 -; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11 ; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v7, v4, s[8:9] @@ -528,19 +519,15 @@ ; ; GFX11-LABEL: load_3d_tfe_lwe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_mov_b32_e32 v7, v2 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 -; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v0, v8 -; GFX11-NEXT: v_mov_b32_e32 v1, v9 -; GFX11-NEXT: v_mov_b32_e32 v2, v10 -; GFX11-NEXT: v_mov_b32_e32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] @@ -680,19 +667,15 @@ ; ; GFX11-LABEL: load_cube_lwe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_mov_b32_e32 v7, v2 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 -; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v0, v8 -; GFX11-NEXT: v_mov_b32_e32 v1, v9 -; GFX11-NEXT: v_mov_b32_e32 v2, v10 -; GFX11-NEXT: v_mov_b32_e32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] @@ -828,18 +811,15 @@ ; ; GFX11-LABEL: load_1darray_tfe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v7, 0 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v9, v7 -; GFX11-NEXT: v_mov_b32_e32 v10, v7 ; GFX11-NEXT: v_mov_b32_e32 v11, v7 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v0, v7 -; GFX11-NEXT: v_mov_b32_e32 v1, v8 -; GFX11-NEXT: v_mov_b32_e32 v2, v9 -; GFX11-NEXT: v_mov_b32_e32 v3, v10 -; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11 ; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v7, v4, s[8:9] @@ -979,19 +959,15 @@ ; ; GFX11-LABEL: load_2darray_lwe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_mov_b32_e32 v7, v2 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 -; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v0, v8 -; GFX11-NEXT: v_mov_b32_e32 v1, v9 -; GFX11-NEXT: v_mov_b32_e32 v2, v10 -; GFX11-NEXT: v_mov_b32_e32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] @@ -1131,19 +1107,15 @@ ; ; GFX11-LABEL: load_2dmsaa_both: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_mov_b32_e32 v7, v2 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 -; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v0, v8 -; GFX11-NEXT: v_mov_b32_e32 v1, v9 -; GFX11-NEXT: v_mov_b32_e32 v2, v10 -; GFX11-NEXT: v_mov_b32_e32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] @@ -1287,20 +1259,16 @@ ; ; GFX11-LABEL: load_2darraymsaa_tfe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v9, 0 -; GFX11-NEXT: v_mov_b32_e32 v8, v3 -; GFX11-NEXT: v_mov_b32_e32 v7, v2 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, v3 +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v10, v9 ; GFX11-NEXT: v_mov_b32_e32 v11, v9 -; GFX11-NEXT: v_mov_b32_e32 v12, v9 ; GFX11-NEXT: v_mov_b32_e32 v13, v9 +; GFX11-NEXT: v_mov_b32_e32 v12, v9 +; GFX11-NEXT: v_mov_b32_e32 v10, v9 ; GFX11-NEXT: v_mov_b32_e32 v0, v9 -; GFX11-NEXT: v_mov_b32_e32 v1, v10 -; GFX11-NEXT: v_mov_b32_e32 v2, v11 -; GFX11-NEXT: v_mov_b32_e32 v3, v12 -; GFX11-NEXT: v_mov_b32_e32 v4, v13 +; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 +; GFX11-NEXT: v_dual_mov_b32 v1, v10 :: v_dual_mov_b32 v4, v13 ; GFX11-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v9, v4, s[8:9] @@ -1436,18 +1404,15 @@ ; ; GFX11-LABEL: load_mip_1d_lwe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v7, 0 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v9, v7 -; GFX11-NEXT: v_mov_b32_e32 v10, v7 ; GFX11-NEXT: v_mov_b32_e32 v11, v7 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v0, v7 -; GFX11-NEXT: v_mov_b32_e32 v1, v8 -; GFX11-NEXT: v_mov_b32_e32 v2, v9 -; GFX11-NEXT: v_mov_b32_e32 v3, v10 -; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11 ; GFX11-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v7, v4, s[8:9] @@ -1587,19 +1552,15 @@ ; ; GFX11-LABEL: load_mip_2d_tfe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_mov_b32_e32 v7, v2 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 -; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v0, v8 -; GFX11-NEXT: v_mov_b32_e32 v1, v9 -; GFX11-NEXT: v_mov_b32_e32 v2, v10 -; GFX11-NEXT: v_mov_b32_e32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] @@ -1996,15 +1957,12 @@ ; ; GFX11-LABEL: load_1d_tfe_V4_dmask3: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, 0 ; GFX11-NEXT: v_mov_b32_e32 v7, v5 ; GFX11-NEXT: v_mov_b32_e32 v8, v5 -; GFX11-NEXT: v_mov_b32_e32 v0, v5 -; GFX11-NEXT: v_mov_b32_e32 v1, v6 -; GFX11-NEXT: v_mov_b32_e32 v2, v7 -; GFX11-NEXT: v_mov_b32_e32 v3, v8 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v2, v7 ; GFX11-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v5, v3, s[8:9] @@ -2089,13 +2047,11 @@ ; ; GFX11-LABEL: load_1d_tfe_V4_dmask2: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v4 +; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, 0 ; GFX11-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-NEXT: v_mov_b32_e32 v5, v4 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 -; GFX11-NEXT: v_mov_b32_e32 v1, v5 -; GFX11-NEXT: v_mov_b32_e32 v2, v6 +; GFX11-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: image_load v[0:2], v3, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v4, v2, s[8:9] @@ -2174,11 +2130,9 @@ ; ; GFX11-LABEL: load_1d_tfe_V4_dmask1: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 ; GFX11-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v3, v1, s[8:9] @@ -2257,11 +2211,9 @@ ; ; GFX11-LABEL: load_1d_tfe_V2_dmask1: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 ; GFX11-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v3, v1, s[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { ; GFX9-LABEL: sample_1d: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -117,8 +117,7 @@ ; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: v_mov_b32_e32 v5, v4 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX11-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -199,15 +198,24 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: image_sample_c_d_1d_v2f16_tfe: -; GFX10PLUS: ; %bb.0: ; %main_body -; GFX10PLUS-NEXT: v_mov_b32_e32 v5, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v4, v1 -; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 -; GFX10PLUS-NEXT: image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16 -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: image_sample_c_d_1d_v2f16_tfe: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: image_sample_c_d_1d_v2f16_tfe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog main_body: %tex = call {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) %tex.vec = extractvalue {<2 x half>, i32} %tex, 0 @@ -313,20 +321,33 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, v5 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: image_sample_b_2d_v3f16_tfe: -; GFX10PLUS: ; %bb.0: ; %main_body -; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo -; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v5, v2 -; GFX10PLUS-NEXT: v_mov_b32_e32 v4, v1 -; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0 -; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10PLUS-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16 -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: image_sample_b_2d_v3f16_tfe: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s12, exec_lo +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: image_sample_b_2d_v3f16_tfe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_mov_b32 s12, exec_lo +; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX11-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog main_body: %tex = call {<3 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v3f16i32.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) %tex.vec = extractvalue {<3 x half>, i32} %tex, 0 @@ -438,20 +459,33 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, v5 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: image_sample_b_2d_v4f16_tfe: -; GFX10PLUS: ; %bb.0: ; %main_body -; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo -; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v5, v2 -; GFX10PLUS-NEXT: v_mov_b32_e32 v4, v1 -; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0 -; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10PLUS-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16 -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: image_sample_b_2d_v4f16_tfe: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s12, exec_lo +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: image_sample_b_2d_v4f16_tfe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_mov_b32 s12, exec_lo +; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX11-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog main_body: %tex = call {<4 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) %tex.vec = extractvalue {<4 x half>, i32} %tex, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -104,17 +104,14 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: s_mov_b32 s14, exec_lo ; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v7, v6 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0 ; GFX11-NEXT: v_mov_b32_e32 v9, v6 ; GFX11-NEXT: v_mov_b32_e32 v10, v6 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v0, v6 -; GFX11-NEXT: v_mov_b32_e32 v1, v7 -; GFX11-NEXT: v_mov_b32_e32 v2, v8 -; GFX11-NEXT: v_mov_b32_e32 v3, v9 -; GFX11-NEXT: v_mov_b32_e32 v4, v10 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10 ; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX11-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -344,18 +341,30 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: sample_1d_tfe_adjust_writemask_12: -; GFX10PLUS: ; %bb.0: ; %main_body -; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo -; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0 -; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10PLUS-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: sample_1d_tfe_adjust_writemask_12: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s12, exec_lo +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: sample_1d_tfe_adjust_writemask_12: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_mov_b32 s12, exec_lo +; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX11-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) %res.vec = extractvalue {<4 x float>,i32} %v, 0 @@ -396,18 +405,30 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: sample_1d_tfe_adjust_writemask_24: -; GFX10PLUS: ; %bb.0: ; %main_body -; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo -; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0 -; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10PLUS-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D tfe -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: sample_1d_tfe_adjust_writemask_24: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s12, exec_lo +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: sample_1d_tfe_adjust_writemask_24: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_mov_b32 s12, exec_lo +; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX11-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) %res.vec = extractvalue {<4 x float>,i32} %v, 0 @@ -546,17 +567,14 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: s_mov_b32 s14, exec_lo ; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v7, v6 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0 ; GFX11-NEXT: v_mov_b32_e32 v9, v6 ; GFX11-NEXT: v_mov_b32_e32 v10, v6 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v0, v6 -; GFX11-NEXT: v_mov_b32_e32 v1, v7 -; GFX11-NEXT: v_mov_b32_e32 v2, v8 -; GFX11-NEXT: v_mov_b32_e32 v3, v9 -; GFX11-NEXT: v_mov_b32_e32 v4, v10 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10 ; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX11-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1614,8 +1632,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v11, 0 ; GFX11-NEXT: v_mov_b32_e32 v12, v11 -; GFX11-NEXT: v_mov_b32_e32 v9, v11 -; GFX11-NEXT: v_mov_b32_e32 v10, v12 +; GFX11-NEXT: v_dual_mov_b32 v9, v11 :: v_dual_mov_b32 v10, v12 ; GFX11-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, v9 @@ -1678,17 +1695,28 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v11 ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: sample_c_d_o_2darray_V2_tfe: -; GFX10PLUS: ; %bb.0: ; %main_body -; GFX10PLUS-NEXT: v_mov_b32_e32 v9, 0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v10, v9 -; GFX10PLUS-NEXT: v_mov_b32_e32 v11, v9 -; GFX10PLUS-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) -; GFX10PLUS-NEXT: v_mov_b32_e32 v0, v9 -; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v10 -; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v11 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: sample_c_d_o_2darray_V2_tfe: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_mov_b32_e32 v10, v9 +; GFX10-NEXT: v_mov_b32_e32 v11, v9 +; GFX10-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v9 +; GFX10-NEXT: v_mov_b32_e32 v1, v10 +; GFX10-NEXT: v_mov_b32_e32 v2, v11 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: sample_c_d_o_2darray_V2_tfe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v9, 0 +; GFX11-NEXT: v_mov_b32_e32 v10, v9 +; GFX11-NEXT: v_mov_b32_e32 v11, v9 +; GFX11-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, v11 +; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) %v.vec = extractvalue {<2 x float>, i32} %v, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx1010 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_d_1d: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_d_1d: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll @@ -42,9 +42,8 @@ ; GCN-NEXT: lds_param_load v2, attr2.x wait_vdst:15 ; GCN-NEXT: lds_param_load v3, attr3.x wait_vdst:15 ; GCN-NEXT: s_mov_b32 exec_lo, s3 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GCN-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3 ; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2 ; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -62,19 +62,16 @@ ; ; GFX11-LABEL: image_bvh_intersect_ray_a16: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: s_lshr_b32 s2, s7, 16 ; GFX11-NEXT: s_lshr_b32 s3, s5, 16 -; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s2 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_pack_ll_b32_b16 s4, s6, s8 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 -; GFX11-NEXT: v_mov_b32_e32 v4, s2 -; GFX11-NEXT: v_mov_b32_e32 v5, s4 -; GFX11-NEXT: v_mov_b32_e32 v6, s0 -; GFX11-NEXT: v_mov_b32_e32 v7, s1 +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s4 ; GFX11-NEXT: s_mov_b32 s15, s12 ; GFX11-NEXT: s_mov_b32 s14, s11 ; GFX11-NEXT: s_mov_b32 s13, s10 @@ -137,20 +134,17 @@ ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v0, s3 -; GFX11-NEXT: v_mov_b32_e32 v6, s0 -; GFX11-NEXT: s_lshr_b32 s0, s8, 16 +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v7, s1 ; GFX11-NEXT: s_lshr_b32 s3, s6, 16 -; GFX11-NEXT: v_mov_b32_e32 v7, s1 -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s3, s0 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s6, s8 -; GFX11-NEXT: s_pack_ll_b32_b16 s3, s7, s9 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: v_mov_b32_e32 v2, s5 -; GFX11-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-NEXT: v_mov_b32_e32 v4, s0 -; GFX11-NEXT: v_mov_b32_e32 v5, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: s_lshr_b32 s0, s8, 16 ; GFX11-NEXT: v_mov_b32_e32 v8, s2 +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX11-NEXT: s_pack_ll_b32_b16 s3, s7, s9 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s3 ; GFX11-NEXT: s_mov_b32 s15, s13 ; GFX11-NEXT: s_mov_b32 s14, s12 ; GFX11-NEXT: s_mov_b32 s13, s11 @@ -226,11 +220,9 @@ ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v4, 4.0 -; GFX11-NEXT: v_mov_b32_e32 v5, 0x40a00000 -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: v_mov_b32_e32 v7, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 ; GFX11-NEXT: v_mov_b32_e32 v8, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v4, 4.0 :: v_dual_mov_b32 v7, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -323,20 +315,19 @@ ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX11-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 ; GFX11-NEXT: v_add_co_u32 v2, s4, s6, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s4 ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] -; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 +; GFX11-NEXT: v_dual_mov_b32 v0, 0x46004200 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 -; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[0:3] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -422,14 +413,10 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 -; GFX11-NEXT: v_mov_b32_e32 v3, 0x40400000 -; GFX11-NEXT: v_mov_b32_e32 v4, 4.0 -; GFX11-NEXT: v_mov_b32_e32 v5, 0x40a00000 -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: v_mov_b32_e32 v7, 1.0 -; GFX11-NEXT: v_mov_b32_e32 v8, 2.0 -; GFX11-NEXT: v_mov_b32_e32 v9, 0xb36211c7 -; GFX11-NEXT: v_mov_b32_e32 v10, 0x102 +; GFX11-NEXT: v_dual_mov_b32 v3, 0x40400000 :: v_dual_mov_b32 v4, 4.0 +; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 +; GFX11-NEXT: v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v9, 0xb36211c7 +; GFX11-NEXT: v_dual_mov_b32 v10, 0x102 :: v_dual_mov_b32 v7, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -513,12 +500,9 @@ ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 -; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX11-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6 -; GFX11-NEXT: v_mov_b32_e32 v7, 0x102 +; GFX11-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v5, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v7, 0x102 +; GFX11-NEXT: v_dual_mov_b32 v6, 0xb36211c6 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll @@ -1,7 +1,7 @@ ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s -; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s -; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s ; GFX10PLUS-LABEL: {{^}}dpp8_test: ; GFX10PLUS: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll @@ -11,9 +11,8 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane64_b32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -28,9 +27,8 @@ ; GFX11-LABEL: test_i: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0x63 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane64_b32 v0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll @@ -7,9 +7,8 @@ ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -18,9 +17,8 @@ ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -34,9 +32,8 @@ ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -45,9 +42,8 @@ ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -63,8 +59,7 @@ ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA) ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -80,8 +75,7 @@ ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME) ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -95,9 +89,8 @@ ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -106,9 +99,8 @@ ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -124,8 +116,7 @@ ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA) ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -139,9 +130,8 @@ ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -150,9 +140,8 @@ ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -168,8 +157,7 @@ ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -81,9 +81,8 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0 ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v4, v3, 0 ; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v5, v2, 0 @@ -226,9 +225,8 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0 ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v4, v3, 0 ; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v5, v2, 0 @@ -251,19 +249,18 @@ ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5 ; GFX11-NEXT: v_add3_u32 v1, v1, v9, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v10, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc_lo ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v7 :: v_dual_cndmask_b32 v4, v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -558,23 +555,21 @@ ; GFX11-NEXT: s_addc_u32 s6, 0, s6 ; GFX11-NEXT: s_sub_u32 s9, s4, s2 ; GFX11-NEXT: s_subb_u32 s10, s6, 0 -; GFX11-NEXT: v_mov_b32_e32 v1, s9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v0, s10 ; GFX11-NEXT: s_cmp_lt_i32 s1, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, s10 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_lt_i32 s3, 0 ; GFX11-NEXT: v_cndmask_b32_e32 v2, s4, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, s6, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_co_u32 v3, vcc_lo, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_add_i32 s1, s8, s7 ; GFX11-NEXT: s_mul_i32 s0, s0, s2 ; GFX11-NEXT: s_add_i32 s1, s1, s5 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v1, v0, v1 :: v_dual_cndmask_b32 v0, v2, v3 ; GFX11-NEXT: s_ashr_i32 s4, s1, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s5, s4 @@ -641,9 +636,8 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ashrrev_i64 v[5:6], 2, v[4:5] ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: @@ -696,15 +690,12 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_and_b32_e32 v7, 0x3fffffff, v1 -; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_and_b32 v7, 0x3fffffff, v1 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1] ; GFX11-NEXT: v_alignbit_b32 v3, v1, v0, 30 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -32,8 +32,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -70,8 +69,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -108,8 +106,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -146,8 +143,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -254,8 +250,7 @@ ; GFX11-NEXT: v_ashrrev_i32_e32 v15, 31, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8] -; GFX11-NEXT: v_mov_b32_e32 v7, v10 -; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v10, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v0, v15, v[9:10] ; GFX11-NEXT: v_mad_i64_i32 v[9:10], null, v1, v14, 0 @@ -310,8 +305,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -405,8 +399,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v5 @@ -494,14 +487,12 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5] -; GFX11-NEXT: v_and_b32_e32 v5, 1, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_and_b32 v5, 1, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] ; GFX11-NEXT: s_setpc_b64 s[30:31] %trunc.lhs = and i64 %arg0, 8589934591 @@ -550,10 +541,9 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5] -; GFX11-NEXT: v_and_b32_e32 v4, 1, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v4, 1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -631,9 +621,8 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: v_mov_b32_e32 v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %tmp4 = lshr i64 %arg0, 32 %tmp5 = and i64 %arg0, 4294967295 @@ -697,14 +686,13 @@ ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_i32 s6, s2, s3 ; GFX11-NEXT: s_mul_hi_u32 s3, s2, s3 ; GFX11-NEXT: s_add_u32 s2, s6, s4 ; GFX11-NEXT: s_addc_u32 s3, s3, s5 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -935,9 +923,8 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v7, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v7, v2, v[4:5] ; GFX11-NEXT: v_mul_lo_u32 v3, v7, v3 ; GFX11-NEXT: v_mul_lo_u32 v2, v6, v2 diff --git a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll --- a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1030 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -amdgpu-enable-delay-alu=0 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s ; RUN: llc -march=amdgcn -mcpu=gfx1030 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX10 %s ; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -mattr=+wavefrontsize64 --verify-machineinstrs < %s | FileCheck --check-prefixes=GCN,GFX11 %s @@ -15,13 +15,6 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, v1, v[2:3] ; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: mad_i32_vvv: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v3, v[2:3] -; GFX11-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b %add = add i32 %mul, %c %cast = bitcast i32 %add to float @@ -51,13 +44,6 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, v1, 42 ; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: mad_i32_vvc: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, 42 -; GFX11-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b %add = add i32 %mul, 42 %cast = bitcast i32 %add to float @@ -76,13 +62,6 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, v1, 0x12d687 ; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: mad_i32_vvi: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, 0x12d687 -; GFX11-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b %add = add i32 %mul, 1234567 %cast = bitcast i32 %add to float @@ -168,13 +147,6 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, v1, s[0:1] ; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: mad_i32_vvs: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, s[0:1] -; GFX11-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b %add = add i32 %mul, %c %cast = bitcast i32 %add to float diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -119,11 +119,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -133,11 +131,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -258,11 +254,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -272,11 +266,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -406,14 +398,12 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -422,14 +412,12 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -568,16 +556,14 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -586,16 +572,14 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -699,8 +683,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -712,8 +695,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -817,8 +799,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -830,8 +811,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -947,8 +927,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -962,8 +941,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1081,8 +1059,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1096,8 +1073,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1203,8 +1179,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1216,8 +1191,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1340,8 +1314,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1356,8 +1329,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1476,8 +1448,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1491,8 +1462,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1629,8 +1599,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1647,8 +1616,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1788,8 +1756,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1806,8 +1773,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1941,8 +1907,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1958,8 +1923,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2105,8 +2069,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2124,8 +2087,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2273,8 +2235,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2292,8 +2253,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2416,10 +2376,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2430,10 +2388,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2568,10 +2524,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2585,10 +2539,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2719,10 +2671,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2735,10 +2685,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2887,10 +2835,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2906,10 +2852,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3061,10 +3005,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3080,10 +3022,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3223,10 +3163,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3240,10 +3178,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3381,10 +3317,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3398,10 +3332,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3551,10 +3483,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3570,10 +3500,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3725,10 +3653,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3744,10 +3670,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3899,10 +3823,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3918,10 +3840,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4073,10 +3993,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4092,10 +4010,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4247,10 +4163,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4266,10 +4180,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4421,10 +4333,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4440,10 +4350,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4595,10 +4503,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4614,10 +4520,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4769,10 +4673,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4788,10 +4690,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4936,10 +4836,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4952,10 +4850,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5108,10 +5004,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5126,10 +5020,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5287,10 +5179,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5305,10 +5195,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5475,10 +5363,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5495,10 +5381,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5667,10 +5551,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5687,10 +5569,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5847,10 +5727,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5865,10 +5743,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -6023,10 +5899,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -6041,10 +5915,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -6211,10 +6083,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6231,10 +6101,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6403,10 +6271,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6423,10 +6289,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6595,10 +6459,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6615,10 +6477,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6787,10 +6647,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6807,10 +6665,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6979,10 +6835,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6999,10 +6853,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7171,10 +7023,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7191,10 +7041,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7363,10 +7211,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7383,10 +7229,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7555,10 +7399,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7575,10 +7417,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7706,11 +7546,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7720,11 +7558,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7845,11 +7681,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7859,11 +7693,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7999,14 +7831,12 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8016,14 +7846,12 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8169,16 +7997,14 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8188,16 +8014,14 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8302,8 +8126,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8315,8 +8138,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8420,8 +8242,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8433,8 +8254,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8550,8 +8370,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8565,8 +8384,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8684,8 +8502,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8699,8 +8516,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8806,8 +8622,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8819,8 +8634,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8941,8 +8755,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8956,8 +8769,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9075,8 +8887,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9090,8 +8901,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9226,8 +9036,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9243,8 +9052,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9381,8 +9189,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9398,8 +9205,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9537,8 +9343,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -9555,8 +9360,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -9708,8 +9512,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9728,8 +9531,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9883,8 +9685,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9903,8 +9704,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -10028,10 +9828,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10042,10 +9840,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10178,10 +9974,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10194,10 +9988,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -10327,10 +10119,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10343,10 +10133,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10493,10 +10281,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10511,10 +10297,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10663,10 +10447,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10681,10 +10463,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10821,10 +10601,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10837,10 +10615,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -10975,10 +10751,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10991,10 +10765,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -11141,10 +10913,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11159,10 +10929,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11311,10 +11079,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11329,10 +11095,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11481,10 +11245,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11499,10 +11261,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11651,10 +11411,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11669,10 +11427,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11821,10 +11577,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11839,10 +11593,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11991,10 +11743,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12009,10 +11759,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12161,10 +11909,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12179,10 +11925,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12331,10 +12075,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12349,10 +12091,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12496,10 +12236,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12512,10 +12250,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12674,10 +12410,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -12693,10 +12427,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -12855,10 +12587,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12873,10 +12603,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13049,10 +12777,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13070,10 +12796,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13249,10 +12973,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13270,10 +12992,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13437,10 +13157,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -13456,10 +13174,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -13621,10 +13337,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -13640,10 +13354,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -13817,10 +13529,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13838,10 +13548,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14017,10 +13725,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14038,10 +13744,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14217,10 +13921,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14238,10 +13940,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14417,10 +14117,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14438,10 +14136,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14617,10 +14313,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14638,10 +14332,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14817,10 +14509,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14838,10 +14528,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -15017,10 +14705,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -15038,10 +14724,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -15217,10 +14901,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -15238,10 +14920,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -119,11 +119,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc dlc -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -133,11 +131,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc dlc -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -275,8 +271,7 @@ ; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0 ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc dlc -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -291,8 +286,7 @@ ; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s0, v0 ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc dlc -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -415,11 +409,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -429,11 +421,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -565,11 +555,10 @@ ; GFX11-WGP-LABEL: flat_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2] ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0 @@ -581,11 +570,10 @@ ; GFX11-CU-LABEL: flat_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s2, v0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2] ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -119,11 +119,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -133,11 +131,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -258,11 +254,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -272,11 +266,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -397,11 +389,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -411,11 +401,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -536,11 +524,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -550,11 +536,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -659,8 +643,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -672,8 +655,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -777,8 +759,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -790,8 +771,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -895,8 +875,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -908,8 +887,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1013,8 +991,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1026,8 +1003,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1131,8 +1107,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1144,8 +1119,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1249,8 +1223,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1262,8 +1235,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1367,8 +1339,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1380,8 +1351,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1485,8 +1455,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1498,8 +1467,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1603,8 +1571,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1616,8 +1583,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1737,8 +1703,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1752,8 +1717,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1876,8 +1840,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1891,8 +1854,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2015,8 +1977,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2030,8 +1991,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2150,10 +2110,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2164,10 +2122,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2283,10 +2239,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2297,10 +2251,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2416,10 +2368,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2430,10 +2380,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2549,10 +2497,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2563,10 +2509,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2682,10 +2626,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2696,10 +2638,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2815,10 +2755,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2829,10 +2767,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2948,10 +2884,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2962,10 +2896,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3081,10 +3013,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3095,10 +3025,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3214,10 +3142,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3228,10 +3154,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3347,10 +3271,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3361,10 +3283,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3480,10 +3400,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3494,10 +3412,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3613,10 +3529,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3627,10 +3541,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3746,10 +3658,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3760,10 +3670,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3879,10 +3787,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3893,10 +3799,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -4012,10 +3916,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -4026,10 +3928,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -4169,10 +4069,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4185,10 +4083,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4332,10 +4228,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4348,10 +4242,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4495,10 +4387,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4511,10 +4401,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4658,10 +4546,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4674,10 +4560,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4821,10 +4705,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4837,10 +4719,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4984,10 +4864,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5000,10 +4878,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5147,10 +5023,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5163,10 +5037,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5310,10 +5182,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5326,10 +5196,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5473,10 +5341,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5489,10 +5355,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5636,10 +5500,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5652,10 +5514,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5799,10 +5659,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5815,10 +5673,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5962,10 +5818,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5978,10 +5832,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -6125,10 +5977,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -6141,10 +5991,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -6288,10 +6136,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -6304,10 +6150,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -6451,10 +6295,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -6467,10 +6309,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -6594,11 +6434,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6608,11 +6446,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6733,11 +6569,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6747,11 +6581,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6872,11 +6704,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6886,11 +6716,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7011,11 +6839,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7025,11 +6851,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7134,8 +6958,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7147,8 +6970,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7252,8 +7074,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7265,8 +7086,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7370,8 +7190,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7383,8 +7202,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7488,8 +7306,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7501,8 +7318,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7606,8 +7422,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7619,8 +7434,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7724,8 +7538,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7737,8 +7550,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7842,8 +7654,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7855,8 +7666,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7960,8 +7770,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7973,8 +7782,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8078,8 +7886,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8091,8 +7898,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8212,8 +8018,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8227,8 +8032,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8351,8 +8155,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8366,8 +8169,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8490,8 +8292,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8505,8 +8306,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8625,10 +8425,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8639,10 +8437,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8758,10 +8554,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8772,10 +8566,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8891,10 +8683,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8905,10 +8695,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9024,10 +8812,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9038,10 +8824,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9157,10 +8941,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9171,10 +8953,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9290,10 +9070,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9304,10 +9082,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9423,10 +9199,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9437,10 +9211,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9556,10 +9328,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9570,10 +9340,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9689,10 +9457,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9703,10 +9469,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9822,10 +9586,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9836,10 +9598,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9955,10 +9715,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9969,10 +9727,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10088,10 +9844,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10102,10 +9856,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10221,10 +9973,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10235,10 +9985,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10354,10 +10102,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10368,10 +10114,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10487,10 +10231,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10501,10 +10243,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10644,10 +10384,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -10660,10 +10398,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -10807,10 +10543,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -10823,10 +10557,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -10970,10 +10702,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -10986,10 +10716,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11133,10 +10861,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11149,10 +10875,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11296,10 +11020,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11312,10 +11034,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11459,10 +11179,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11475,10 +11193,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11622,10 +11338,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11638,10 +11352,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11785,10 +11497,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11801,10 +11511,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11948,10 +11656,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11964,10 +11670,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12111,10 +11815,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12127,10 +11829,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12274,10 +11974,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12290,10 +11988,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12437,10 +12133,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12453,10 +12147,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12600,10 +12292,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12616,10 +12306,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12763,10 +12451,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12779,10 +12465,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12926,10 +12610,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12942,10 +12624,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -119,11 +119,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -133,11 +131,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -258,11 +254,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -272,11 +266,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -408,14 +400,12 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -424,14 +414,12 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -572,16 +560,14 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -590,16 +576,14 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -703,8 +687,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -716,8 +699,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -821,8 +803,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -834,8 +815,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -953,8 +933,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -968,8 +947,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1089,8 +1067,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1104,8 +1081,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1211,8 +1187,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1224,8 +1199,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1350,8 +1324,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1366,8 +1339,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1488,8 +1460,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1503,8 +1474,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1645,8 +1615,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1663,8 +1632,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1808,8 +1776,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1826,8 +1793,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1963,8 +1929,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1980,8 +1945,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2131,8 +2095,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2150,8 +2113,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2303,8 +2265,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2322,8 +2283,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2446,10 +2406,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2460,10 +2418,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2600,10 +2556,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2617,10 +2571,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2753,10 +2705,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2769,10 +2719,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2925,10 +2873,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2944,10 +2890,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3103,10 +3047,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3122,10 +3064,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3267,10 +3207,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3284,10 +3222,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3427,10 +3363,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3444,10 +3378,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3601,10 +3533,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3620,10 +3550,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3779,10 +3707,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3798,10 +3724,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3957,10 +3881,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3976,10 +3898,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4135,10 +4055,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4154,10 +4072,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4313,10 +4229,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4332,10 +4246,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4491,10 +4403,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4510,10 +4420,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4669,10 +4577,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4688,10 +4594,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4847,10 +4751,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4866,10 +4768,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -5014,10 +4914,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5030,10 +4928,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5188,10 +5084,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5206,10 +5100,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5369,10 +5261,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5387,10 +5277,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5561,10 +5449,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5581,10 +5467,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5757,10 +5641,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5777,10 +5659,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5939,10 +5819,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5957,10 +5835,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -6117,10 +5993,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -6135,10 +6009,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -6309,10 +6181,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6329,10 +6199,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6505,10 +6373,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6525,10 +6391,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6701,10 +6565,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6721,10 +6583,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6897,10 +6757,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6917,10 +6775,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7093,10 +6949,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7113,10 +6967,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7289,10 +7141,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7309,10 +7159,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7485,10 +7333,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7505,10 +7351,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7681,10 +7525,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7701,10 +7543,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7832,11 +7672,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7846,11 +7684,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7971,11 +7807,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7985,11 +7819,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8127,14 +7959,12 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8144,14 +7974,12 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8299,16 +8127,14 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8318,16 +8144,14 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8432,8 +8256,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8445,8 +8268,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8550,8 +8372,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8563,8 +8384,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8682,8 +8502,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8697,8 +8516,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8818,8 +8636,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8833,8 +8650,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8940,8 +8756,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8953,8 +8768,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -9077,8 +8891,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9092,8 +8905,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9213,8 +9025,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9228,8 +9039,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9368,8 +9178,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9385,8 +9194,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9527,8 +9335,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9544,8 +9351,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9685,8 +9491,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -9703,8 +9508,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -9860,8 +9664,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9880,8 +9683,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -10039,8 +9841,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -10059,8 +9860,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -10184,10 +9984,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10198,10 +9996,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10336,10 +10132,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10352,10 +10146,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -10487,10 +10279,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10503,10 +10293,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10657,10 +10445,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10675,10 +10461,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10831,10 +10615,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10849,10 +10631,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10991,10 +10771,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -11007,10 +10785,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -11147,10 +10923,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -11163,10 +10937,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -11317,10 +11089,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11335,10 +11105,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11491,10 +11259,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11509,10 +11275,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11665,10 +11429,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11683,10 +11445,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11839,10 +11599,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11857,10 +11615,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12013,10 +11769,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12031,10 +11785,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12187,10 +11939,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12205,10 +11955,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12361,10 +12109,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12379,10 +12125,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12535,10 +12279,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12553,10 +12295,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12700,10 +12440,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12716,10 +12454,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12880,10 +12616,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -12899,10 +12633,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -13063,10 +12795,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13081,10 +12811,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13261,10 +12989,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13282,10 +13008,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13465,10 +13189,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13486,10 +13208,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13655,10 +13375,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -13674,10 +13392,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -13841,10 +13557,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -13860,10 +13574,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -14041,10 +13753,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14062,10 +13772,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14245,10 +13953,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14266,10 +13972,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14449,10 +14153,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14470,10 +14172,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14653,10 +14353,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14674,10 +14372,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14857,10 +14553,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14878,10 +14572,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -15061,10 +14753,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -15082,10 +14772,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -15265,10 +14953,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -15286,10 +14972,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -15469,10 +15153,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -15490,10 +15172,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -67,12 +67,10 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -82,12 +80,10 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -172,8 +168,7 @@ ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -189,8 +184,7 @@ ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -265,11 +259,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -280,11 +272,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -363,11 +353,10 @@ ; GFX11-WGP-LABEL: flat_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2] ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0 @@ -380,11 +369,10 @@ ; GFX11-CU-LABEL: flat_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s2, v0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2] ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0 @@ -463,13 +451,11 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -478,12 +464,10 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -553,8 +537,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -568,8 +551,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -119,11 +119,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -133,11 +131,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -258,11 +254,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -272,11 +266,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -397,11 +389,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -411,11 +401,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -536,11 +524,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -550,11 +536,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -659,8 +643,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -672,8 +655,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -777,8 +759,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -790,8 +771,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -895,8 +875,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -908,8 +887,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1013,8 +991,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1026,8 +1003,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1131,8 +1107,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1144,8 +1119,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1249,8 +1223,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1262,8 +1235,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1367,8 +1339,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1380,8 +1351,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1485,8 +1455,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1498,8 +1467,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1603,8 +1571,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1616,8 +1583,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1737,8 +1703,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1752,8 +1717,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1876,8 +1840,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1891,8 +1854,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2015,8 +1977,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2030,8 +1991,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2150,10 +2110,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2164,10 +2122,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2283,10 +2239,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2297,10 +2251,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2416,10 +2368,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2430,10 +2380,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2549,10 +2497,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2563,10 +2509,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2682,10 +2626,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2696,10 +2638,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2815,10 +2755,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2829,10 +2767,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2948,10 +2884,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2962,10 +2896,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3081,10 +3013,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3095,10 +3025,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3214,10 +3142,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3228,10 +3154,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3347,10 +3271,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3361,10 +3283,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3480,10 +3400,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3494,10 +3412,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3613,10 +3529,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3627,10 +3541,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3746,10 +3658,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3760,10 +3670,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3879,10 +3787,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3893,10 +3799,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -4012,10 +3916,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -4026,10 +3928,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -4169,10 +4069,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4185,10 +4083,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4332,10 +4228,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4348,10 +4242,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4495,10 +4387,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4511,10 +4401,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4658,10 +4546,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4674,10 +4560,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4821,10 +4705,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4837,10 +4719,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4984,10 +4864,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5000,10 +4878,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5147,10 +5023,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5163,10 +5037,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5310,10 +5182,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5326,10 +5196,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5473,10 +5341,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5489,10 +5355,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5636,10 +5500,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5652,10 +5514,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5799,10 +5659,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5815,10 +5673,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5962,10 +5818,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -5978,10 +5832,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -6125,10 +5977,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -6141,10 +5991,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -6288,10 +6136,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -6304,10 +6150,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -6451,10 +6295,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -6467,10 +6309,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -6594,11 +6434,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6608,11 +6446,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6733,11 +6569,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6747,11 +6581,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6872,11 +6704,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6886,11 +6716,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7011,11 +6839,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7025,11 +6851,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7134,8 +6958,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7147,8 +6970,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7252,8 +7074,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7265,8 +7086,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7370,8 +7190,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7383,8 +7202,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7488,8 +7306,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7501,8 +7318,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7606,8 +7422,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7619,8 +7434,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7724,8 +7538,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7737,8 +7550,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7842,8 +7654,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7855,8 +7666,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7960,8 +7770,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7973,8 +7782,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8078,8 +7886,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8091,8 +7898,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8212,8 +8018,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8227,8 +8032,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8351,8 +8155,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8366,8 +8169,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8490,8 +8292,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8505,8 +8306,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8625,10 +8425,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8639,10 +8437,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8758,10 +8554,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8772,10 +8566,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8891,10 +8683,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8905,10 +8695,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9024,10 +8812,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9038,10 +8824,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9157,10 +8941,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9171,10 +8953,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9290,10 +9070,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9304,10 +9082,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9423,10 +9199,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9437,10 +9211,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9556,10 +9328,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9570,10 +9340,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9689,10 +9457,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9703,10 +9469,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9822,10 +9586,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9836,10 +9598,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9955,10 +9715,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9969,10 +9727,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10088,10 +9844,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10102,10 +9856,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10221,10 +9973,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10235,10 +9985,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10354,10 +10102,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10368,10 +10114,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10487,10 +10231,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10501,10 +10243,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10644,10 +10384,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -10660,10 +10398,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -10807,10 +10543,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -10823,10 +10557,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -10970,10 +10702,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -10986,10 +10716,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11133,10 +10861,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11149,10 +10875,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11296,10 +11020,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11312,10 +11034,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11459,10 +11179,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11475,10 +11193,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11622,10 +11338,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11638,10 +11352,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11785,10 +11497,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11801,10 +11511,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11948,10 +11656,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -11964,10 +11670,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12111,10 +11815,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12127,10 +11829,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12274,10 +11974,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12290,10 +11988,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12437,10 +12133,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12453,10 +12147,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12600,10 +12292,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12616,10 +12306,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12763,10 +12451,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -12779,10 +12465,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -119,11 +119,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -133,11 +131,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -258,11 +254,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -272,11 +266,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -405,13 +397,11 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -420,12 +410,10 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -563,15 +551,13 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -580,13 +566,11 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -691,8 +675,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -704,8 +687,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -809,8 +791,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -822,8 +803,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -936,8 +916,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -951,8 +930,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -1066,8 +1044,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1081,8 +1058,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -1187,8 +1163,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1200,8 +1175,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1317,8 +1291,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1332,8 +1305,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1447,8 +1419,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1462,8 +1433,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -1589,8 +1559,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1606,8 +1575,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -1734,8 +1702,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1751,8 +1718,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 @@ -1877,8 +1843,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -1893,8 +1858,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -2029,8 +1993,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2047,8 +2010,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc @@ -2184,8 +2146,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2202,8 +2163,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc @@ -2323,10 +2283,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2337,10 +2295,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2468,10 +2424,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2484,10 +2438,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2613,10 +2565,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2629,10 +2579,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2770,10 +2718,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2788,10 +2734,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2930,10 +2874,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2948,10 +2890,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3081,10 +3021,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3097,10 +3035,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3229,10 +3165,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3245,10 +3179,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3386,10 +3318,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3404,10 +3334,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3546,10 +3474,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3564,10 +3490,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3706,10 +3630,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3724,10 +3646,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3866,10 +3786,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3884,10 +3802,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4029,10 +3945,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -4045,10 +3959,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4198,10 +4110,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -4215,10 +4125,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -4371,10 +4279,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -4389,10 +4295,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4552,10 +4456,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -4571,10 +4473,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4734,10 +4634,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -4753,10 +4651,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -4907,10 +4803,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -4924,10 +4818,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5077,10 +4969,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5094,10 +4984,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -5256,10 +5144,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5275,10 +5161,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5438,10 +5322,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5457,10 +5339,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5620,10 +5500,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5639,10 +5517,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5802,10 +5678,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5821,10 +5695,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -5984,10 +5856,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6003,10 +5873,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6166,10 +6034,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6185,10 +6051,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6348,10 +6212,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6367,10 +6229,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6530,10 +6390,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6549,10 +6407,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -6677,11 +6533,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6691,11 +6545,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6816,11 +6668,9 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6830,11 +6680,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6959,13 +6807,11 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -6975,11 +6821,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7108,15 +6952,13 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7126,11 +6968,9 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7235,8 +7075,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7248,8 +7087,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7353,8 +7191,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7366,8 +7203,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7475,8 +7311,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7490,8 +7325,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7599,8 +7433,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7614,8 +7447,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7719,8 +7551,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7732,8 +7563,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7843,8 +7673,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7857,8 +7686,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -7966,8 +7794,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7981,8 +7808,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8096,8 +7922,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8112,8 +7937,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8227,8 +8051,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8243,8 +8066,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -8368,8 +8190,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -8385,8 +8206,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8517,8 +8337,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8536,8 +8355,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8668,8 +8486,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8687,8 +8504,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) @@ -8807,10 +8623,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8821,10 +8635,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8946,10 +8758,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -8961,10 +8771,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9084,10 +8892,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -9100,10 +8906,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9229,10 +9033,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -9246,10 +9048,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9375,10 +9175,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -9392,10 +9190,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9517,10 +9313,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -9532,10 +9326,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9657,10 +9449,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -9672,10 +9462,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9801,10 +9589,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -9818,10 +9604,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9947,10 +9731,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -9964,10 +9746,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10093,10 +9873,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10110,10 +9888,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10239,10 +10015,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10256,10 +10030,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10385,10 +10157,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10402,10 +10172,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10531,10 +10299,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10548,10 +10314,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10677,10 +10441,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10694,10 +10456,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10823,10 +10583,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10840,10 +10598,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10983,10 +10739,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 @@ -10999,10 +10753,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11150,10 +10902,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -11168,10 +10918,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11319,10 +11067,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -11337,10 +11083,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11492,10 +11236,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -11512,10 +11254,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11667,10 +11407,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -11687,10 +11425,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -11838,10 +11574,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -11856,10 +11590,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12007,10 +11739,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -12025,10 +11755,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12180,10 +11908,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12200,10 +11926,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12355,10 +12079,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12375,10 +12097,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12530,10 +12250,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12550,10 +12268,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12705,10 +12421,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12725,10 +12439,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -12880,10 +12592,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12900,10 +12610,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -13055,10 +12763,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13075,10 +12781,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -13230,10 +12934,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13250,10 +12952,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 @@ -13405,10 +13105,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13425,10 +13123,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -675,9 +675,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -687,9 +686,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -800,9 +798,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -812,9 +809,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -938,9 +934,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -952,9 +947,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1080,9 +1074,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1094,9 +1087,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1209,9 +1201,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -1221,9 +1212,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -1353,9 +1343,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1367,9 +1356,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -1495,9 +1483,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1509,9 +1496,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1656,9 +1642,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1672,9 +1657,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1821,9 +1805,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1837,9 +1820,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1982,9 +1964,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1998,9 +1979,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -2157,9 +2137,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2175,9 +2154,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2336,9 +2314,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2354,9 +2331,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2483,8 +2459,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2496,8 +2471,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2637,8 +2611,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -2652,8 +2625,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -2789,8 +2761,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -2804,8 +2775,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -2960,8 +2930,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -2977,8 +2946,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3135,8 +3103,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3152,8 +3119,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3297,8 +3263,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -3312,8 +3277,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -3455,8 +3419,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -3470,8 +3433,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -3626,8 +3588,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3643,8 +3604,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3801,8 +3761,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3818,8 +3777,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3976,8 +3934,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3993,8 +3950,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4151,8 +4107,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4168,8 +4123,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4326,8 +4280,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4343,8 +4296,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4501,8 +4453,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4518,8 +4469,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4676,8 +4626,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4693,8 +4642,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4851,8 +4799,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4868,8 +4815,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -5014,8 +4960,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5029,8 +4974,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5185,8 +5129,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5202,8 +5145,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5363,8 +5305,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5380,8 +5321,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5551,8 +5491,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5570,8 +5509,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5743,8 +5681,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5762,8 +5699,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5922,8 +5858,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5939,8 +5874,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -6097,8 +6031,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -6114,8 +6047,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -6285,8 +6217,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6304,8 +6235,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6477,8 +6407,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6496,8 +6425,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6669,8 +6597,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6688,8 +6615,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6861,8 +6787,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6880,8 +6805,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -7053,8 +6977,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -7072,8 +6995,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -7245,8 +7167,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -7264,8 +7185,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -7437,8 +7357,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -7456,8 +7375,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -7629,8 +7547,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -7648,8 +7565,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -8332,9 +8248,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8344,9 +8259,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8457,9 +8371,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8469,9 +8382,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8595,9 +8507,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8609,9 +8520,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8737,9 +8647,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8751,9 +8660,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8866,9 +8774,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8878,9 +8785,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9010,9 +8916,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -9024,9 +8929,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -9152,9 +9056,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -9166,9 +9069,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -9313,9 +9215,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -9329,9 +9230,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -9478,9 +9378,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -9494,9 +9393,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -9639,9 +9537,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -9655,9 +9552,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -9814,9 +9710,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -9832,9 +9727,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -9993,9 +9887,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -10011,9 +9904,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -10140,8 +10032,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10153,8 +10044,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10294,8 +10184,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10309,8 +10198,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -10446,8 +10334,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10461,8 +10348,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10617,8 +10503,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10634,8 +10519,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10792,8 +10676,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10809,8 +10692,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10954,8 +10836,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10969,8 +10850,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -11112,8 +10992,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -11127,8 +11006,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -11283,8 +11161,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11300,8 +11177,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11458,8 +11334,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11475,8 +11350,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11633,8 +11507,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11650,8 +11523,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11808,8 +11680,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11825,8 +11696,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11983,8 +11853,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -12000,8 +11869,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -12158,8 +12026,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -12175,8 +12042,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -12333,8 +12199,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -12350,8 +12215,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -12508,8 +12372,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -12525,8 +12388,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -12671,8 +12533,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12686,8 +12547,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12842,8 +12702,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -12859,8 +12718,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -13030,8 +12888,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13049,8 +12906,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13222,8 +13078,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13241,8 +13096,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13401,8 +13255,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -13418,8 +13271,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -13576,8 +13428,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -13593,8 +13444,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -13764,8 +13614,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13783,8 +13632,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13956,8 +13804,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13975,8 +13822,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14148,8 +13994,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14167,8 +14012,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14340,8 +14184,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14359,8 +14202,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14532,8 +14374,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14551,8 +14392,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14724,8 +14564,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14743,8 +14582,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14916,8 +14754,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14935,8 +14772,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -15108,8 +14944,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -15127,8 +14962,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -121,11 +121,10 @@ ; GFX11-WGP-LABEL: global_nontemporal_load_0: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -133,11 +132,10 @@ ; GFX11-CU-LABEL: global_nontemporal_load_0: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -267,8 +265,7 @@ ; GFX11-WGP-LABEL: global_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v0, v0, s[0:1] slc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -279,8 +276,7 @@ ; GFX11-CU-LABEL: global_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v0, v0, s[0:1] slc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -406,11 +402,10 @@ ; GFX11-WGP-LABEL: global_nontemporal_store_0: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -418,11 +413,10 @@ ; GFX11-CU-LABEL: global_nontemporal_store_0: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -547,11 +541,10 @@ ; GFX11-WGP-LABEL: global_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -559,11 +552,10 @@ ; GFX11-CU-LABEL: global_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -640,9 +640,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -652,9 +651,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -765,9 +763,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -777,9 +774,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -890,9 +886,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -902,9 +897,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -1015,9 +1009,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -1027,9 +1020,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -1140,9 +1132,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -1152,9 +1143,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -1265,9 +1255,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -1277,9 +1266,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -1390,9 +1378,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -1402,9 +1389,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -1515,9 +1501,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -1527,9 +1512,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -1640,9 +1624,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -1652,9 +1635,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -1783,9 +1765,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1797,9 +1778,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1931,9 +1911,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1945,9 +1924,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2079,9 +2057,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2093,9 +2070,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2218,8 +2194,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2231,8 +2206,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2353,8 +2327,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2366,8 +2339,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2488,8 +2460,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2501,8 +2472,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2623,8 +2593,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2636,8 +2605,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2758,8 +2726,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2771,8 +2738,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2893,8 +2859,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2906,8 +2871,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3028,8 +2992,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3041,8 +3004,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3163,8 +3125,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3176,8 +3137,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3298,8 +3258,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3311,8 +3270,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3433,8 +3391,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3446,8 +3403,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3568,8 +3524,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3581,8 +3536,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3703,8 +3657,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3716,8 +3669,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3838,8 +3790,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3851,8 +3802,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3973,8 +3923,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3986,8 +3935,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -4108,8 +4056,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -4121,8 +4068,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -4263,8 +4209,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4278,8 +4223,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4424,8 +4368,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4439,8 +4382,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4585,8 +4527,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4600,8 +4541,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4746,8 +4686,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4761,8 +4700,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4907,8 +4845,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4922,8 +4859,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5068,8 +5004,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5083,8 +5018,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5229,8 +5163,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5244,8 +5177,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5390,8 +5322,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5405,8 +5336,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5551,8 +5481,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5566,8 +5495,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5712,8 +5640,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5727,8 +5654,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5873,8 +5799,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5888,8 +5813,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -6034,8 +5958,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -6049,8 +5972,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -6195,8 +6117,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -6210,8 +6131,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -6356,8 +6276,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -6371,8 +6290,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -6517,8 +6435,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -6532,8 +6449,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -7177,9 +7093,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -7189,9 +7104,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -7302,9 +7216,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -7314,9 +7227,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -7427,9 +7339,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -7439,9 +7350,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -7552,9 +7462,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -7564,9 +7473,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -7677,9 +7585,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -7689,9 +7596,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -7802,9 +7708,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -7814,9 +7719,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -7927,9 +7831,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -7939,9 +7842,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8052,9 +7954,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8064,9 +7965,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8177,9 +8077,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8189,9 +8088,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8320,9 +8218,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8334,9 +8231,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8468,9 +8364,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8482,9 +8377,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8616,9 +8510,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8630,9 +8523,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8755,8 +8647,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8768,8 +8659,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8890,8 +8780,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8903,8 +8792,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9025,8 +8913,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9038,8 +8925,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9160,8 +9046,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9173,8 +9058,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9295,8 +9179,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9308,8 +9191,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9430,8 +9312,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9443,8 +9324,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9565,8 +9445,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9578,8 +9457,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9700,8 +9578,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9713,8 +9590,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9835,8 +9711,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9848,8 +9723,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9970,8 +9844,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9983,8 +9856,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10105,8 +9977,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10118,8 +9989,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10240,8 +10110,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10253,8 +10122,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10375,8 +10243,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10388,8 +10255,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10510,8 +10376,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10523,8 +10388,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10645,8 +10509,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10658,8 +10521,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10800,8 +10662,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -10815,8 +10676,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -10961,8 +10821,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -10976,8 +10835,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11122,8 +10980,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11137,8 +10994,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11283,8 +11139,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11298,8 +11153,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11444,8 +11298,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11459,8 +11312,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11605,8 +11457,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11620,8 +11471,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11766,8 +11616,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11781,8 +11630,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11927,8 +11775,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11942,8 +11789,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12088,8 +11934,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12103,8 +11948,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12249,8 +12093,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12264,8 +12107,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12410,8 +12252,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12425,8 +12266,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12571,8 +12411,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12586,8 +12425,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12732,8 +12570,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12747,8 +12584,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12893,8 +12729,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12908,8 +12743,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -13054,8 +12888,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -13069,8 +12902,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -679,9 +679,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -691,9 +690,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -804,9 +802,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -816,9 +813,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -944,9 +940,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -958,9 +953,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1088,9 +1082,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1102,9 +1095,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1217,9 +1209,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -1229,9 +1220,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -1363,9 +1353,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1377,9 +1366,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -1507,9 +1495,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1521,9 +1508,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1672,9 +1658,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1688,9 +1673,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1841,9 +1825,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1857,9 +1840,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -2004,9 +1986,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -2020,9 +2001,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -2183,9 +2163,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2201,9 +2180,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2366,9 +2344,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2384,9 +2361,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2513,8 +2489,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2526,8 +2501,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2669,8 +2643,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -2684,8 +2657,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -2823,8 +2795,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -2838,8 +2809,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -2998,8 +2968,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3015,8 +2984,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3177,8 +3145,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3194,8 +3161,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3341,8 +3307,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -3356,8 +3321,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -3501,8 +3465,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -3516,8 +3479,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -3676,8 +3638,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3693,8 +3654,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3855,8 +3815,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3872,8 +3831,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4034,8 +3992,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4051,8 +4008,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4213,8 +4169,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4230,8 +4185,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4376,8 +4330,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4391,8 +4344,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4549,8 +4501,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -4566,8 +4517,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -4741,8 +4691,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -4760,8 +4709,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -4937,8 +4885,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -4956,8 +4903,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5118,8 +5064,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5135,8 +5080,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5295,8 +5239,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5312,8 +5255,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -5487,8 +5429,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5506,8 +5447,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5683,8 +5623,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5702,8 +5641,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5879,8 +5817,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5898,8 +5835,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6075,8 +6011,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6094,8 +6029,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6271,8 +6205,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6290,8 +6223,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6467,8 +6399,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6486,8 +6417,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6663,8 +6593,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6682,8 +6611,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6859,8 +6787,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6878,8 +6805,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -7566,9 +7492,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -7578,9 +7503,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -7691,9 +7615,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -7703,9 +7626,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -7831,9 +7753,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7845,9 +7766,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7975,9 +7895,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -7989,9 +7908,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8104,9 +8022,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8116,9 +8033,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8250,9 +8166,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -8264,9 +8179,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -8394,9 +8308,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -8408,9 +8321,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -8559,9 +8471,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -8575,9 +8486,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -8728,9 +8638,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -8744,9 +8653,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -8891,9 +8799,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -8907,9 +8814,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -9070,9 +8976,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -9088,9 +8993,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -9253,9 +9157,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -9271,9 +9174,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -9400,8 +9302,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9413,8 +9314,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9556,8 +9456,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -9571,8 +9470,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -9710,8 +9608,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -9725,8 +9622,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -9885,8 +9781,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -9902,8 +9797,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10064,8 +9958,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10081,8 +9974,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10228,8 +10120,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10243,8 +10134,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -10388,8 +10278,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10403,8 +10292,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -10563,8 +10451,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10580,8 +10467,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10742,8 +10628,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10759,8 +10644,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10921,8 +10805,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10938,8 +10821,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11100,8 +10982,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11117,8 +10998,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11279,8 +11159,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11296,8 +11175,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11458,8 +11336,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11475,8 +11352,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11637,8 +11513,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11654,8 +11529,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11816,8 +11690,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11833,8 +11706,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11979,8 +11851,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11994,8 +11865,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12152,8 +12022,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -12169,8 +12038,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -12332,8 +12200,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -12349,8 +12216,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -12524,8 +12390,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -12543,8 +12408,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -12720,8 +12584,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -12739,8 +12602,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -12901,8 +12763,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -12918,8 +12779,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -13078,8 +12938,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -13095,8 +12954,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv @@ -13270,8 +13128,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13289,8 +13146,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13466,8 +13322,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13485,8 +13340,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13662,8 +13516,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13681,8 +13534,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13858,8 +13710,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13877,8 +13728,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14054,8 +13904,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14073,8 +13922,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14250,8 +14098,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14269,8 +14116,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14446,8 +14292,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14465,8 +14310,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14642,8 +14486,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14661,8 +14504,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -174,8 +174,7 @@ ; GFX11-WGP-LABEL: global_volatile_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -186,8 +185,7 @@ ; GFX11-CU-LABEL: global_volatile_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -274,11 +272,10 @@ ; GFX11-WGP-LABEL: global_volatile_store_0: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -287,11 +284,10 @@ ; GFX11-CU-LABEL: global_volatile_store_0: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -378,11 +374,10 @@ ; GFX11-WGP-LABEL: global_volatile_store_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -391,11 +386,10 @@ ; GFX11-CU-LABEL: global_volatile_store_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -570,9 +564,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -584,9 +577,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -640,9 +640,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -652,9 +651,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -765,9 +763,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -777,9 +774,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -890,9 +886,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -902,9 +897,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -1015,9 +1009,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -1027,9 +1020,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -1140,9 +1132,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -1152,9 +1143,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -1265,9 +1255,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -1277,9 +1266,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -1390,9 +1378,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -1402,9 +1389,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -1515,9 +1501,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -1527,9 +1512,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -1640,9 +1624,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -1652,9 +1635,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -1783,9 +1765,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1797,9 +1778,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1931,9 +1911,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1945,9 +1924,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2079,9 +2057,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2093,9 +2070,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2218,8 +2194,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2231,8 +2206,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2353,8 +2327,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2366,8 +2339,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2488,8 +2460,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2501,8 +2472,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2623,8 +2593,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2636,8 +2605,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2758,8 +2726,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2771,8 +2738,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2893,8 +2859,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2906,8 +2871,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3028,8 +2992,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3041,8 +3004,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3163,8 +3125,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3176,8 +3137,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3298,8 +3258,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3311,8 +3270,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3433,8 +3391,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3446,8 +3403,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3568,8 +3524,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3581,8 +3536,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3703,8 +3657,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3716,8 +3669,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3838,8 +3790,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3851,8 +3802,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3973,8 +3923,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -3986,8 +3935,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -4108,8 +4056,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -4121,8 +4068,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -4263,8 +4209,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4278,8 +4223,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4424,8 +4368,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4439,8 +4382,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4585,8 +4527,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4600,8 +4541,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4746,8 +4686,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4761,8 +4700,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4907,8 +4845,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4922,8 +4859,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5068,8 +5004,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5083,8 +5018,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5229,8 +5163,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5244,8 +5177,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5390,8 +5322,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5405,8 +5336,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5551,8 +5481,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5566,8 +5495,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5712,8 +5640,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5727,8 +5654,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5873,8 +5799,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5888,8 +5813,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -6034,8 +5958,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -6049,8 +5972,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -6195,8 +6117,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -6210,8 +6131,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -6356,8 +6276,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -6371,8 +6290,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -6517,8 +6435,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -6532,8 +6449,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -7177,9 +7093,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -7189,9 +7104,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -7302,9 +7216,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -7314,9 +7227,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -7427,9 +7339,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -7439,9 +7350,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -7552,9 +7462,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -7564,9 +7473,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -7677,9 +7585,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -7689,9 +7596,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -7802,9 +7708,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -7814,9 +7719,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -7927,9 +7831,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -7939,9 +7842,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8052,9 +7954,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8064,9 +7965,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8177,9 +8077,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8189,9 +8088,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8320,9 +8218,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8334,9 +8231,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8468,9 +8364,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8482,9 +8377,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8616,9 +8510,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8630,9 +8523,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8755,8 +8647,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8768,8 +8659,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8890,8 +8780,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8903,8 +8792,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9025,8 +8913,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9038,8 +8925,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9160,8 +9046,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9173,8 +9058,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9295,8 +9179,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9308,8 +9191,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9430,8 +9312,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9443,8 +9324,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9565,8 +9445,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9578,8 +9457,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9700,8 +9578,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9713,8 +9590,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9835,8 +9711,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9848,8 +9723,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9970,8 +9844,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9983,8 +9856,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10105,8 +9977,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10118,8 +9989,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10240,8 +10110,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10253,8 +10122,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10375,8 +10243,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10388,8 +10255,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10510,8 +10376,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10523,8 +10388,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10645,8 +10509,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -10658,8 +10521,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10800,8 +10662,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -10815,8 +10676,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -10961,8 +10821,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -10976,8 +10835,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11122,8 +10980,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11137,8 +10994,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11283,8 +11139,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11298,8 +11153,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11444,8 +11298,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11459,8 +11312,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11605,8 +11457,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11620,8 +11471,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11766,8 +11616,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11781,8 +11630,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11927,8 +11775,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11942,8 +11789,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12088,8 +11934,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12103,8 +11948,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12249,8 +12093,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12264,8 +12107,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12410,8 +12252,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12425,8 +12266,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12571,8 +12411,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12586,8 +12425,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12732,8 +12570,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12747,8 +12584,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12893,8 +12729,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12908,8 +12743,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -13054,8 +12888,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -13069,8 +12902,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -653,9 +653,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -665,9 +664,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -778,9 +776,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -790,9 +787,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -913,9 +909,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -927,9 +922,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1051,9 +1045,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -1065,9 +1058,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1179,9 +1171,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -1191,9 +1182,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -1310,9 +1300,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1323,9 +1312,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -1446,9 +1434,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1460,9 +1447,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1590,9 +1576,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1605,9 +1590,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1735,9 +1719,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1750,9 +1733,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -1885,9 +1867,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1900,9 +1881,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -2047,9 +2027,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2064,9 +2043,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -2212,9 +2190,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2229,9 +2206,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -2355,8 +2331,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -2368,8 +2343,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2496,8 +2470,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -2510,8 +2483,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -2642,8 +2614,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -2657,8 +2628,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2796,8 +2766,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -2812,8 +2781,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -2951,8 +2919,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -2967,8 +2934,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3096,8 +3062,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -3110,8 +3075,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3238,8 +3202,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -3252,8 +3215,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -3390,8 +3352,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3406,8 +3367,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3545,8 +3505,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3561,8 +3520,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3700,8 +3658,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3716,8 +3673,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -3855,8 +3811,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3871,8 +3826,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4010,8 +3964,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4026,8 +3979,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4165,8 +4117,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4181,8 +4132,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4320,8 +4270,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4336,8 +4285,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4475,8 +4423,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4491,8 +4438,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -4634,8 +4580,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4649,8 +4594,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4798,8 +4742,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -4814,8 +4757,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -4970,8 +4912,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -4987,8 +4928,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -5147,8 +5087,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5165,8 +5104,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -5325,8 +5263,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5343,8 +5280,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -5493,8 +5429,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5509,8 +5444,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5658,8 +5592,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -5674,8 +5607,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -5833,8 +5765,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5851,8 +5782,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -6011,8 +5941,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6029,8 +5958,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -6189,8 +6117,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6207,8 +6134,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -6367,8 +6293,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6385,8 +6310,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -6545,8 +6469,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6563,8 +6486,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -6723,8 +6645,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6741,8 +6662,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -6901,8 +6821,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6919,8 +6838,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -7079,8 +6997,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -7097,8 +7014,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -7753,9 +7669,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -7765,9 +7680,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -7878,9 +7792,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -7890,9 +7803,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8007,9 +7919,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8021,9 +7932,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8138,9 +8048,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] @@ -8152,9 +8061,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8265,9 +8173,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -8277,9 +8184,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8396,9 +8302,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -8409,9 +8314,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8526,9 +8430,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -8540,9 +8443,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8663,9 +8565,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -8678,9 +8579,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8801,9 +8701,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -8816,9 +8715,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -8950,9 +8848,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -8965,9 +8862,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9106,9 +9002,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -9123,9 +9018,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9264,9 +9158,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -9281,9 +9174,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] @@ -9406,8 +9298,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -9419,8 +9310,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9547,8 +9437,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -9561,8 +9450,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9687,8 +9575,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -9702,8 +9589,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9834,8 +9720,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -9850,8 +9735,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -9982,8 +9866,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -9998,8 +9881,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10126,8 +10008,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10140,8 +10021,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10268,8 +10148,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10282,8 +10161,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10414,8 +10292,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10430,8 +10307,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10562,8 +10438,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10578,8 +10453,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10710,8 +10584,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10726,8 +10599,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -10858,8 +10730,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10874,8 +10745,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -11006,8 +10876,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11022,8 +10891,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -11154,8 +11022,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11170,8 +11037,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -11302,8 +11168,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11318,8 +11183,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -11450,8 +11314,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11466,8 +11329,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm @@ -11608,8 +11470,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11623,8 +11484,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11772,8 +11632,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -11788,8 +11647,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -11938,8 +11796,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -11955,8 +11812,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12108,8 +11964,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -12126,8 +11981,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12279,8 +12133,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -12297,8 +12150,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12446,8 +12298,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -12462,8 +12313,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12611,8 +12461,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -12627,8 +12476,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12780,8 +12628,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -12798,8 +12645,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -12951,8 +12797,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -12969,8 +12814,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -13122,8 +12966,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13140,8 +12983,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -13293,8 +13135,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13311,8 +13152,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -13464,8 +13304,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13482,8 +13321,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -13635,8 +13473,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13653,8 +13490,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -13806,8 +13642,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13824,8 +13659,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] @@ -13977,8 +13811,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13995,8 +13828,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -119,8 +119,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -130,8 +129,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -251,8 +249,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -262,8 +259,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -386,8 +382,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -398,8 +393,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -532,8 +526,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 @@ -546,8 +539,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -650,8 +642,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -659,8 +650,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -759,8 +749,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -768,8 +757,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -878,8 +866,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -889,8 +876,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm @@ -1000,8 +986,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -1011,8 +996,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm @@ -1112,8 +1096,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1121,8 +1104,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1231,8 +1213,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1242,8 +1223,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -1353,8 +1333,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 @@ -1364,8 +1343,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm @@ -1485,8 +1463,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 @@ -1498,8 +1475,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1620,8 +1596,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 @@ -1633,8 +1608,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1756,8 +1730,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1768,8 +1741,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -1902,8 +1874,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 @@ -1916,8 +1887,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2051,8 +2021,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 @@ -2065,8 +2034,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2178,8 +2146,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2188,8 +2155,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2309,8 +2275,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2321,8 +2286,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2443,8 +2407,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2455,8 +2418,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -2587,8 +2549,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2601,8 +2562,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -2734,8 +2694,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2748,8 +2707,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -2871,8 +2829,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2883,8 +2840,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3005,8 +2961,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3017,8 +2972,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3149,8 +3103,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3163,8 +3116,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3296,8 +3248,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3310,8 +3261,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3443,8 +3393,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3457,8 +3406,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3590,8 +3538,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3604,8 +3551,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3737,8 +3683,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3751,8 +3696,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3884,8 +3828,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3898,8 +3841,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -4031,8 +3973,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4045,8 +3986,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -4178,8 +4118,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4192,8 +4131,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -4323,8 +4261,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4335,8 +4272,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4471,8 +4407,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4484,8 +4419,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4627,8 +4561,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4641,8 +4574,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -4788,8 +4720,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4803,8 +4734,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -4950,8 +4880,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4965,8 +4894,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5102,8 +5030,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5115,8 +5042,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5251,8 +5177,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5264,8 +5189,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5410,8 +5334,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5425,8 +5348,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5572,8 +5494,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5587,8 +5508,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5734,8 +5654,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5749,8 +5668,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5896,8 +5814,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5911,8 +5828,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6058,8 +5974,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6073,8 +5988,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6220,8 +6134,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6235,8 +6148,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6382,8 +6294,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6397,8 +6308,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6544,8 +6454,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6559,8 +6468,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6684,8 +6592,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6695,8 +6602,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6816,8 +6722,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6827,8 +6732,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6948,8 +6852,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6959,8 +6862,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -7080,8 +6982,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -7091,8 +6992,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -7194,8 +7094,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7203,8 +7102,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7303,8 +7201,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7312,8 +7209,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7412,8 +7308,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7421,8 +7316,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7521,8 +7415,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7530,8 +7423,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7630,8 +7522,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7639,8 +7530,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7739,8 +7629,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7748,8 +7637,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7848,8 +7736,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7857,8 +7744,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7957,8 +7843,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7966,8 +7851,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -8066,8 +7950,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -8075,8 +7958,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -8193,8 +8075,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8204,8 +8085,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8325,8 +8205,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8336,8 +8215,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8457,8 +8335,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8468,8 +8345,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8580,8 +8456,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8590,8 +8465,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8701,8 +8575,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8711,8 +8584,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8822,8 +8694,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8832,8 +8703,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8943,8 +8813,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8953,8 +8822,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9064,8 +8932,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9074,8 +8941,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9185,8 +9051,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9195,8 +9060,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9306,8 +9170,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9316,8 +9179,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9427,8 +9289,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9437,8 +9298,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9548,8 +9408,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9558,8 +9417,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9669,8 +9527,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9679,8 +9536,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9790,8 +9646,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9800,8 +9655,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9911,8 +9765,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9921,8 +9774,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10032,8 +9884,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -10042,8 +9893,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10153,8 +10003,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -10163,8 +10012,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10274,8 +10122,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -10284,8 +10131,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10413,8 +10259,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10425,8 +10270,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10558,8 +10402,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10570,8 +10413,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10703,8 +10545,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10715,8 +10556,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10848,8 +10688,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10860,8 +10699,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10993,8 +10831,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11005,8 +10842,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11138,8 +10974,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11150,8 +10985,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11283,8 +11117,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11295,8 +11128,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11428,8 +11260,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11440,8 +11271,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11573,8 +11403,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11585,8 +11414,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11718,8 +11546,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11730,8 +11557,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11863,8 +11689,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11875,8 +11700,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12008,8 +11832,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12020,8 +11843,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12153,8 +11975,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12165,8 +11986,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12298,8 +12118,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12310,8 +12129,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12443,8 +12261,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12455,8 +12272,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -133,9 +133,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] @@ -147,9 +146,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] @@ -439,9 +437,8 @@ ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -452,9 +449,8 @@ ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(3)* %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll @@ -119,8 +119,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -130,8 +129,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -251,8 +249,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -262,8 +259,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -383,8 +379,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -394,8 +389,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -515,8 +509,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -526,8 +519,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -629,8 +621,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -638,8 +629,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -738,8 +728,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -747,8 +736,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -847,8 +835,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -856,8 +843,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -956,8 +942,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -965,8 +950,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -1065,8 +1049,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1074,8 +1057,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1174,8 +1156,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1183,8 +1164,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1283,8 +1263,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1292,8 +1271,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1392,8 +1370,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1401,8 +1378,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1501,8 +1477,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1510,8 +1485,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1628,8 +1602,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -1639,8 +1612,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -1760,8 +1732,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -1771,8 +1742,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -1892,8 +1862,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -1903,8 +1872,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -2015,8 +1983,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2025,8 +1992,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2136,8 +2102,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2146,8 +2111,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2257,8 +2221,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2267,8 +2230,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2378,8 +2340,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2388,8 +2349,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2499,8 +2459,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2509,8 +2468,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2620,8 +2578,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2630,8 +2587,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2741,8 +2697,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2751,8 +2706,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2862,8 +2816,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2872,8 +2825,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2983,8 +2935,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2993,8 +2944,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3104,8 +3054,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3114,8 +3063,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3225,8 +3173,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3235,8 +3182,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3346,8 +3292,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3356,8 +3301,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3467,8 +3411,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3477,8 +3420,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3588,8 +3530,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3598,8 +3539,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3709,8 +3649,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3719,8 +3658,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3848,8 +3786,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3860,8 +3797,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3993,8 +3929,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4005,8 +3940,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4138,8 +4072,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4150,8 +4083,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4283,8 +4215,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4295,8 +4226,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4428,8 +4358,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4440,8 +4369,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4573,8 +4501,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4585,8 +4512,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4718,8 +4644,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4730,8 +4655,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4863,8 +4787,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4875,8 +4798,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5008,8 +4930,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5020,8 +4941,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5153,8 +5073,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5165,8 +5084,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5298,8 +5216,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5310,8 +5227,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5443,8 +5359,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5455,8 +5370,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5588,8 +5502,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5600,8 +5513,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5733,8 +5645,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5745,8 +5656,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5878,8 +5788,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5890,8 +5799,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6014,8 +5922,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6025,8 +5932,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6146,8 +6052,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6157,8 +6062,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6278,8 +6182,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6289,8 +6192,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6410,8 +6312,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6421,8 +6322,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6524,8 +6424,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -6533,8 +6432,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -6633,8 +6531,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -6642,8 +6539,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -6742,8 +6638,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -6751,8 +6646,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -6851,8 +6745,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -6860,8 +6753,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -6960,8 +6852,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -6969,8 +6860,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7069,8 +6959,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7078,8 +6967,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7178,8 +7066,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7187,8 +7074,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7287,8 +7173,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7296,8 +7181,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7396,8 +7280,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7405,8 +7288,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7523,8 +7405,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7534,8 +7415,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7655,8 +7535,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7666,8 +7545,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7787,8 +7665,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7798,8 +7675,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7910,8 +7786,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -7920,8 +7795,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8031,8 +7905,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8041,8 +7914,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8152,8 +8024,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8162,8 +8033,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8273,8 +8143,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8283,8 +8152,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8394,8 +8262,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8404,8 +8271,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8515,8 +8381,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8525,8 +8390,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8636,8 +8500,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8646,8 +8509,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8757,8 +8619,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8767,8 +8628,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8878,8 +8738,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8888,8 +8747,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8999,8 +8857,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9009,8 +8866,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9120,8 +8976,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9130,8 +8985,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9241,8 +9095,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9251,8 +9104,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9362,8 +9214,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9372,8 +9223,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9483,8 +9333,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9493,8 +9342,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9604,8 +9452,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9614,8 +9461,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9743,8 +9589,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9755,8 +9600,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9888,8 +9732,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9900,8 +9743,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10033,8 +9875,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10045,8 +9886,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10178,8 +10018,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10190,8 +10029,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10323,8 +10161,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10335,8 +10172,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10468,8 +10304,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10480,8 +10315,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10613,8 +10447,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10625,8 +10458,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10758,8 +10590,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10770,8 +10601,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10903,8 +10733,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10915,8 +10744,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11048,8 +10876,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11060,8 +10887,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11193,8 +11019,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11205,8 +11030,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11338,8 +11162,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11350,8 +11173,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11483,8 +11305,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11495,8 +11316,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11628,8 +11448,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11640,8 +11459,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11773,8 +11591,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11785,8 +11602,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -119,8 +119,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -130,8 +129,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -251,8 +249,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -262,8 +259,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -386,8 +382,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -398,8 +393,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -532,8 +526,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 @@ -546,8 +539,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -650,8 +642,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -659,8 +650,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -759,8 +749,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -768,8 +757,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -878,8 +866,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -889,8 +876,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm @@ -1000,8 +986,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -1011,8 +996,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm @@ -1112,8 +1096,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1121,8 +1104,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1231,8 +1213,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1242,8 +1223,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -1353,8 +1333,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 @@ -1364,8 +1343,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm @@ -1485,8 +1463,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 @@ -1498,8 +1475,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1620,8 +1596,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 @@ -1633,8 +1608,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1756,8 +1730,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1768,8 +1741,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -1902,8 +1874,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 @@ -1916,8 +1887,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2051,8 +2021,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 @@ -2065,8 +2034,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2178,8 +2146,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2188,8 +2155,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2309,8 +2275,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2321,8 +2286,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2443,8 +2407,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2455,8 +2418,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -2587,8 +2549,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2601,8 +2562,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -2734,8 +2694,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2748,8 +2707,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -2871,8 +2829,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2883,8 +2840,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3005,8 +2961,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3017,8 +2972,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3149,8 +3103,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3163,8 +3116,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3296,8 +3248,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3310,8 +3261,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3443,8 +3393,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3457,8 +3406,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3590,8 +3538,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3604,8 +3551,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3737,8 +3683,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3751,8 +3696,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3884,8 +3828,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3898,8 +3841,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -4031,8 +3973,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4045,8 +3986,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -4178,8 +4118,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4192,8 +4131,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -4323,8 +4261,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4335,8 +4272,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4471,8 +4407,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4484,8 +4419,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4627,8 +4561,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4641,8 +4574,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -4788,8 +4720,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4803,8 +4734,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -4950,8 +4880,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4965,8 +4894,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5102,8 +5030,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5115,8 +5042,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5251,8 +5177,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5264,8 +5189,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5410,8 +5334,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5425,8 +5348,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5572,8 +5494,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5587,8 +5508,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5734,8 +5654,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5749,8 +5668,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5896,8 +5814,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5911,8 +5828,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6058,8 +5974,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6073,8 +5988,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6220,8 +6134,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6235,8 +6148,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6382,8 +6294,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6397,8 +6308,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6544,8 +6454,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6559,8 +6468,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6684,8 +6592,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6695,8 +6602,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6816,8 +6722,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6827,8 +6732,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6948,8 +6852,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6959,8 +6862,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -7080,8 +6982,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -7091,8 +6992,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -7194,8 +7094,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7203,8 +7102,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7303,8 +7201,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7312,8 +7209,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7412,8 +7308,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7421,8 +7316,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7521,8 +7415,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7530,8 +7423,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7630,8 +7522,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7639,8 +7530,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7739,8 +7629,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7748,8 +7637,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7848,8 +7736,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7857,8 +7744,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7957,8 +7843,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7966,8 +7851,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -8066,8 +7950,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -8075,8 +7958,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -8193,8 +8075,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8204,8 +8085,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8325,8 +8205,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8336,8 +8215,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8457,8 +8335,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8468,8 +8345,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8580,8 +8456,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8590,8 +8465,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8701,8 +8575,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8711,8 +8584,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8822,8 +8694,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8832,8 +8703,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8943,8 +8813,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8953,8 +8822,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9064,8 +8932,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9074,8 +8941,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9185,8 +9051,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9195,8 +9060,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9306,8 +9170,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9316,8 +9179,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9427,8 +9289,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9437,8 +9298,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9548,8 +9408,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9558,8 +9417,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9669,8 +9527,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9679,8 +9536,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9790,8 +9646,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9800,8 +9655,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9911,8 +9765,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9921,8 +9774,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10032,8 +9884,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -10042,8 +9893,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10153,8 +10003,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -10163,8 +10012,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10274,8 +10122,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -10284,8 +10131,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10413,8 +10259,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10425,8 +10270,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10558,8 +10402,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10570,8 +10413,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10703,8 +10545,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10715,8 +10556,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10848,8 +10688,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10860,8 +10699,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10993,8 +10831,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11005,8 +10842,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11138,8 +10974,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11150,8 +10985,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11283,8 +11117,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11295,8 +11128,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11428,8 +11260,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11440,8 +11271,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11573,8 +11403,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11585,8 +11414,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11718,8 +11546,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11730,8 +11557,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11863,8 +11689,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11875,8 +11700,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12008,8 +11832,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12020,8 +11843,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12153,8 +11975,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12165,8 +11986,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12298,8 +12118,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12310,8 +12129,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12443,8 +12261,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12455,8 +12272,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -81,9 +81,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] @@ -95,9 +94,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] @@ -291,9 +289,8 @@ ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -304,9 +301,8 @@ ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(3)* %out) { @@ -484,8 +480,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -496,8 +491,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -569,8 +563,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -580,8 +573,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll @@ -119,8 +119,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -130,8 +129,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -251,8 +249,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -262,8 +259,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -383,8 +379,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -394,8 +389,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -515,8 +509,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -526,8 +519,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -629,8 +621,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -638,8 +629,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -738,8 +728,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -747,8 +736,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -847,8 +835,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -856,8 +843,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -956,8 +942,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -965,8 +950,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -1065,8 +1049,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1074,8 +1057,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1174,8 +1156,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1183,8 +1164,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1283,8 +1263,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1292,8 +1271,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1392,8 +1370,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1401,8 +1378,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1501,8 +1477,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1510,8 +1485,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1628,8 +1602,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -1639,8 +1612,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -1760,8 +1732,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -1771,8 +1742,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -1892,8 +1862,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -1903,8 +1872,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -2015,8 +1983,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2025,8 +1992,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2136,8 +2102,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2146,8 +2111,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2257,8 +2221,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2267,8 +2230,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2378,8 +2340,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2388,8 +2349,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2499,8 +2459,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2509,8 +2468,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2620,8 +2578,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2630,8 +2587,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2741,8 +2697,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2751,8 +2706,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2862,8 +2816,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2872,8 +2825,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2983,8 +2935,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2993,8 +2944,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3104,8 +3054,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3114,8 +3063,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3225,8 +3173,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3235,8 +3182,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3346,8 +3292,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3356,8 +3301,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3467,8 +3411,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3477,8 +3420,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3588,8 +3530,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3598,8 +3539,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3709,8 +3649,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3719,8 +3658,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3848,8 +3786,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3860,8 +3797,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3993,8 +3929,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4005,8 +3940,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4138,8 +4072,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4150,8 +4083,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4283,8 +4215,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4295,8 +4226,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4428,8 +4358,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4440,8 +4369,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4573,8 +4501,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4585,8 +4512,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4718,8 +4644,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4730,8 +4655,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4863,8 +4787,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4875,8 +4798,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5008,8 +4930,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5020,8 +4941,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5153,8 +5073,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5165,8 +5084,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5298,8 +5216,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5310,8 +5227,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5443,8 +5359,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5455,8 +5370,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5588,8 +5502,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5600,8 +5513,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5733,8 +5645,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5745,8 +5656,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5878,8 +5788,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5890,8 +5799,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6014,8 +5922,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6025,8 +5932,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6146,8 +6052,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6157,8 +6062,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6278,8 +6182,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6289,8 +6192,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6410,8 +6312,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6421,8 +6322,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6524,8 +6424,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -6533,8 +6432,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -6633,8 +6531,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -6642,8 +6539,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -6742,8 +6638,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -6751,8 +6646,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -6851,8 +6745,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -6860,8 +6753,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -6960,8 +6852,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -6969,8 +6860,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7069,8 +6959,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7078,8 +6967,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7178,8 +7066,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7187,8 +7074,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7287,8 +7173,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7296,8 +7181,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7396,8 +7280,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7405,8 +7288,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7523,8 +7405,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7534,8 +7415,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7655,8 +7535,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7666,8 +7545,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7787,8 +7665,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7798,8 +7675,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7910,8 +7786,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -7920,8 +7795,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8031,8 +7905,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8041,8 +7914,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8152,8 +8024,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8162,8 +8033,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8273,8 +8143,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8283,8 +8152,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8394,8 +8262,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8404,8 +8271,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8515,8 +8381,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8525,8 +8390,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8636,8 +8500,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8646,8 +8509,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8757,8 +8619,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8767,8 +8628,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8878,8 +8738,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8888,8 +8747,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8999,8 +8857,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9009,8 +8866,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9120,8 +8976,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9130,8 +8985,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9241,8 +9095,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9251,8 +9104,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9362,8 +9214,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9372,8 +9223,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9483,8 +9333,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9493,8 +9342,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9604,8 +9452,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9614,8 +9461,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9743,8 +9589,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9755,8 +9600,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9888,8 +9732,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9900,8 +9743,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10033,8 +9875,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10045,8 +9886,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10178,8 +10018,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10190,8 +10029,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10323,8 +10161,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10335,8 +10172,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10468,8 +10304,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10480,8 +10315,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10613,8 +10447,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10625,8 +10458,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10758,8 +10590,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10770,8 +10601,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10903,8 +10733,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10915,8 +10744,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11048,8 +10876,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11060,8 +10887,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11193,8 +11019,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11205,8 +11030,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11338,8 +11162,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11350,8 +11173,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11483,8 +11305,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11495,8 +11316,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11628,8 +11448,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11640,8 +11459,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11773,8 +11591,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11785,8 +11602,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -119,8 +119,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -130,8 +129,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -251,8 +249,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -262,8 +259,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -386,8 +382,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -398,8 +393,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -532,8 +526,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 @@ -546,8 +539,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -650,8 +642,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -659,8 +650,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -759,8 +749,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -768,8 +757,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -878,8 +866,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -889,8 +876,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm @@ -1000,8 +986,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -1011,8 +996,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm @@ -1112,8 +1096,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1121,8 +1104,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1231,8 +1213,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1242,8 +1223,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -1353,8 +1333,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 @@ -1364,8 +1343,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm @@ -1485,8 +1463,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 @@ -1498,8 +1475,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1620,8 +1596,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 @@ -1633,8 +1608,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1756,8 +1730,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1768,8 +1741,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -1902,8 +1874,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 @@ -1916,8 +1887,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2051,8 +2021,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 @@ -2065,8 +2034,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2178,8 +2146,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2188,8 +2155,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2309,8 +2275,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2321,8 +2286,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2443,8 +2407,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2455,8 +2418,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -2587,8 +2549,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2601,8 +2562,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -2734,8 +2694,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2748,8 +2707,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -2871,8 +2829,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2883,8 +2840,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3005,8 +2961,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3017,8 +2972,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3149,8 +3103,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3163,8 +3116,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3296,8 +3248,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3310,8 +3261,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3443,8 +3393,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3457,8 +3406,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3590,8 +3538,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3604,8 +3551,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3737,8 +3683,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3751,8 +3696,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3884,8 +3828,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3898,8 +3841,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -4031,8 +3973,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4045,8 +3986,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -4178,8 +4118,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4192,8 +4131,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -4323,8 +4261,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4335,8 +4272,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4471,8 +4407,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4484,8 +4419,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4627,8 +4561,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4641,8 +4574,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -4788,8 +4720,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4803,8 +4734,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -4950,8 +4880,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4965,8 +4894,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5102,8 +5030,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5115,8 +5042,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5251,8 +5177,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5264,8 +5189,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5410,8 +5334,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5425,8 +5348,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5572,8 +5494,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5587,8 +5508,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5734,8 +5654,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5749,8 +5668,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5896,8 +5814,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5911,8 +5828,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6058,8 +5974,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6073,8 +5988,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6220,8 +6134,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6235,8 +6148,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6382,8 +6294,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6397,8 +6308,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6544,8 +6454,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6559,8 +6468,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6684,8 +6592,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6695,8 +6602,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6816,8 +6722,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6827,8 +6732,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6948,8 +6852,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6959,8 +6862,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -7080,8 +6982,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -7091,8 +6992,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -7194,8 +7094,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7203,8 +7102,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7303,8 +7201,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7312,8 +7209,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7412,8 +7308,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7421,8 +7316,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7521,8 +7415,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7530,8 +7423,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7630,8 +7522,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7639,8 +7530,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7739,8 +7629,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7748,8 +7637,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7848,8 +7736,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7857,8 +7744,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7957,8 +7843,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7966,8 +7851,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -8066,8 +7950,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -8075,8 +7958,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -8193,8 +8075,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8204,8 +8085,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8325,8 +8205,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8336,8 +8215,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8457,8 +8335,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8468,8 +8345,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8580,8 +8456,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8590,8 +8465,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8701,8 +8575,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8711,8 +8584,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8822,8 +8694,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8832,8 +8703,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8943,8 +8813,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8953,8 +8822,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9064,8 +8932,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9074,8 +8941,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9185,8 +9051,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9195,8 +9060,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9306,8 +9170,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9316,8 +9179,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9427,8 +9289,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9437,8 +9298,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9548,8 +9408,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9558,8 +9417,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9669,8 +9527,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9679,8 +9536,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9790,8 +9646,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9800,8 +9655,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9911,8 +9765,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9921,8 +9774,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10032,8 +9884,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -10042,8 +9893,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10153,8 +10003,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -10163,8 +10012,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10274,8 +10122,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -10284,8 +10131,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10413,8 +10259,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10425,8 +10270,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10558,8 +10402,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10570,8 +10413,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10703,8 +10545,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10715,8 +10556,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10848,8 +10688,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10860,8 +10699,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10993,8 +10831,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11005,8 +10842,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11138,8 +10974,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11150,8 +10985,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11283,8 +11117,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11295,8 +11128,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11428,8 +11260,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11440,8 +11271,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11573,8 +11403,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11585,8 +11414,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11718,8 +11546,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11730,8 +11557,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11863,8 +11689,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11875,8 +11700,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12008,8 +11832,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12020,8 +11843,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12153,8 +11975,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12165,8 +11986,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12298,8 +12118,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12310,8 +12129,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12443,8 +12261,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12455,8 +12272,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -333,9 +333,8 @@ ; GFX11-WGP-LABEL: private_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 slc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -346,9 +345,8 @@ ; GFX11-CU-LABEL: private_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 slc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -684,11 +682,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 -; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 glc slc dlc ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm @@ -698,11 +695,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 -; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 glc slc dlc ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -223,9 +223,8 @@ ; GFX11-WGP-LABEL: private_volatile_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -236,9 +235,8 @@ ; GFX11-CU-LABEL: private_volatile_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -480,11 +478,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 -; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) @@ -495,11 +492,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 -; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) diff --git a/llvm/test/CodeGen/AMDGPU/verify-vopd.mir b/llvm/test/CodeGen/AMDGPU/verify-vopd.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/verify-vopd.mir @@ -0,0 +1,11 @@ +# RUN: not --crash llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s + +# GFX11-ERR: *** Bad machine code: VOP* instruction violates constant bus restriction *** +# GFX11-ERR: $vgpr2, $vgpr3 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32 $sgpr0, $vgpr0, $sgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $vcc, implicit $vcc_lo +--- +name: vopd_cndmask_2sgpr +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + $vgpr2, $vgpr3 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32 $sgpr0, $vgpr0, $sgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $vcc, implicit $vcc_lo +... diff --git a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir @@ -0,0 +1,543 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=postmisched %s -o - | FileCheck -check-prefix=SCHED %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=postmisched,gcn-create-vopd %s -o - | FileCheck -check-prefix=PAIR %s + +--- | + @lds = external addrspace(3) global [8 x i8] + define void @vopd_schedule() { ret void } + define void @vopd_fmamk() { ret void } + define void @vopd_fmamk_fail() { ret void } + define void @vopd_cndmask() { ret void } + define void @vopd_mov() { ret void } + define void @vopd_mov_mov() { ret void } + define void @vopd_constants_fail() { ret void } + define void @vopd_constants_inlinable() { ret void } + define void @vopd_constants_same() { ret void } + define void @vopd_mov_fmaak_constants_same() { ret void } + define void @vopd_debug() { ret void } + define void @vopd_schedule_unconstrained() { ret void } + define void @vopd_schedule_unconstrained_2() { ret void } + define void @vopd_mov_fixup() { ret void } + define void @vopd_mov_fixup_fail() { ret void } +... + +--- +name: vopd_schedule +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_schedule + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_schedule + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; can fuse vgpr3 and vgpr6 writing insts only due to reg constraints + $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr6 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + +... + +--- +name: vopd_fmamk +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_fmamk + ; SCHED: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 10, killed $vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_fmamk + ; PAIR: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32 killed $vgpr0, 10, killed $vgpr3, killed $vgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + ; should pair + $vgpr2 = V_FMAC_F32_e32 $vgpr1, $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr5 = V_FMAMK_F32 $vgpr0, 10, $vgpr3, implicit $mode, implicit $exec + +... + +--- +name: vopd_fmamk_fail +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_fmamk_fail + ; SCHED: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_XOR_B32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 10, killed $vgpr4, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_fmamk_fail + ; PAIR: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = V_XOR_B32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 10, killed $vgpr4, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr2 = V_FMAC_F32_e32 killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_XOR_B32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr4 = IMPLICIT_DEF + ; should not pair + $vgpr2 = V_FMAC_F32_e32 $vgpr1, $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr5 = V_FMAMK_F32 $vgpr0, 10, $vgpr4, implicit $mode, implicit $exec + +... + +--- +name: vopd_cndmask +tracksRegLiveness: true +body: | + bb.0: + ; SCHED-LABEL: name: vopd_cndmask + ; SCHED: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr20 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 $sgpr20, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-LABEL: name: vopd_cndmask + ; PAIR: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr20 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32 $sgpr20, killed $vgpr1, $vgpr0, $vgpr3, implicit $exec, implicit $vcc, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $sgpr20 = IMPLICIT_DEF + ; should pair + $vgpr2 = V_FMAC_F32_e32 $sgpr20, $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr5 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; should not pair, uses 3 scalars (implicit vcc) + $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec + $vgpr7 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; should not pair, uses 3 scalars (implicit vcc) + $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec + $vgpr9 = V_CNDMASK_B32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + +... + +--- +name: vopd_mov +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_mov + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr0, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_ADD_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_mov + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32 killed $vgpr0, killed $vgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + +... + +--- +name: vopd_mov_mov +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_mov_mov + ; SCHED: $sgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr7 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr7, implicit $exec + ; PAIR-LABEL: name: vopd_mov_mov + ; PAIR: $sgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr7 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32 killed $sgpr0, killed $sgpr7, implicit $exec, implicit $exec, implicit $exec + $sgpr0 = IMPLICIT_DEF + $sgpr7 = IMPLICIT_DEF + $vgpr2 = V_MOV_B32_e32 $sgpr0, implicit $exec + $vgpr3 = V_MOV_B32_e32 $sgpr7, implicit $exec + +... + + +--- +name: vopd_constants_fail +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_constants_fail + ; SCHED: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 100, killed $vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 99, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_constants_fail + ; PAIR: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 100, killed $vgpr3, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr2 = V_FMAC_F32_e32 99, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + ; should not pair with two different literals + $vgpr2 = V_FMAC_F32_e32 99, $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr5 = V_FMAMK_F32 $vgpr0, 100, $vgpr3, implicit $mode, implicit $exec + +... + +--- +name: vopd_constants_inlinable +tracksRegLiveness: true +body: | + bb.0: + ; SCHED-LABEL: name: vopd_constants_inlinable + ; SCHED: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 100, killed $vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 4, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_constants_inlinable + ; PAIR: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32 killed $vgpr0, 100, killed $vgpr3, 4, killed $vgpr1, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + ; can pair since 4 is inlinable + $vgpr2 = V_FMAC_F32_e32 4, $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr5 = V_FMAMK_F32 $vgpr0, 100, $vgpr3, implicit $mode, implicit $exec + +... + + +--- +name: vopd_constants_same +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_constants_same + ; SCHED: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 100, killed $vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 100, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_constants_same + ; PAIR: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32 killed $vgpr0, 100, killed $vgpr3, 100, killed $vgpr1, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + ; should be able to pair using 1 deduplicated literal + $vgpr2 = V_FMAC_F32_e32 100, $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr5 = V_FMAMK_F32 $vgpr0, 100, $vgpr3, implicit $mode, implicit $exec + +... + +--- +name: vopd_mov_fmaak_constants_same +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_mov_fmaak_constants_same + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = V_MOV_B32_e32 981467136, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_FMAAK_F32 killed $sgpr0, killed $vgpr0, 981467136, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_mov_fmaak_constants_same + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1, $vgpr2 = V_DUAL_MOV_B32_e32_X_FMAAK_F32 981467136, killed $sgpr0, killed $vgpr0, 981467136, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + ; should be able to pair using 1 deduplicated literal + $vgpr1 = V_MOV_B32_e32 981467136, implicit $exec + $vgpr2 = V_FMAAK_F32 $sgpr0, $vgpr0, 981467136, implicit $mode, implicit $exec + +... + +--- +name: vopd_debug +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_debug + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: DBG_VALUE $vgpr0, 0, 0 + ; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_debug + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32 killed $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: DBG_VALUE $vgpr0, 0, 0 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + ; TODO Debug values disable VOPD creation + $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + DBG_VALUE $vgpr0, 0, 0 + $vgpr6 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + +... + +--- +name: vopd_schedule_unconstrained +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_schedule_unconstrained + ; SCHED: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr12 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr17 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr10 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_schedule_unconstrained + ; PAIR: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr12, $vgpr19 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $exec, implicit $vcc, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-NEXT: $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $exec, implicit $vcc, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr10 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr6 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; $vgpr9 = V_FMAMK_F32 $vgpr0, 10, $vgpr2, implicit $mode, implicit $exec + $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + $vgpr12 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr17 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; $vgpr18 = V_FMAMK_F32 $vgpr0, 10, $vgpr3, implicit $mode, implicit $exec + ; $vgpr11 = V_FMAC_F32_e32 10, $vgpr1, $vgpr11, implicit $mode, implicit $exec + $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr14 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + +... + +--- +name: vopd_schedule_unconstrained_2 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_schedule_unconstrained_2 + ; SCHED: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr20 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr35 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr29 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr20 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr20, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr10 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr17 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr12 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr37 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr14 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr21 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr24 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr22 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_schedule_unconstrained_2 + ; PAIR: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr20 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr16, $vgpr35 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $exec, implicit $vcc, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32 $vgpr0, $vgpr3, 10, $vgpr1, implicit $exec, implicit $mode, implicit $vcc, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $exec, implicit $mode, implicit $vcc, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $vcc, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $vcc, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr21, $vgpr24 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-NEXT: $vgpr22 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr10 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr6 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; $vgpr9 = V_FMAMK_F32 $vgpr0, 10, $vgpr2, implicit $mode, implicit $exec + $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + $vgpr12 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr17 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; $vgpr18 = V_FMAMK_F32 $vgpr0, 10, $vgpr3, implicit $mode, implicit $exec + ; $vgpr11 = V_FMAC_F32_e32 10, $vgpr1, $vgpr11, implicit $mode, implicit $exec + $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr14 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr20 = IMPLICIT_DEF + $vgpr20 = V_FMAC_F32_e32 10, $vgpr1, $vgpr20, implicit $mode, implicit $exec + $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr21 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + $vgpr22 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr24 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; $vgpr9 = V_FMAMK_F32 $vgpr0, 10, $vgpr2, implicit $mode, implicit $exec + $vgpr29 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + $vgpr37 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + $vgpr31 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr35 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; $vgpr18 = V_FMAMK_F32 $vgpr0, 10, $vgpr3, implicit $mode, implicit $exec + ; $vgpr11 = V_FMAC_F32_e32 10, $vgpr1, $vgpr11, implicit $mode, implicit $exec + $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr32 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + +... + +--- +name: vopd_mov_fixup +tracksRegLiveness: true +body: | + bb.0: + ; SCHED-LABEL: name: vopd_mov_fixup + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_ADD_F32_e32 killed $vgpr0, killed $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec + ; PAIR-LABEL: name: vopd_mov_fixup + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32 target-flags(amdgpu-abs32-lo) @lds, killed $vgpr0, killed $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4, $vgpr5 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, target-flags(amdgpu-abs32-lo) @lds, implicit $exec, implicit $exec, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + ; should pair + $vgpr2 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec + $vgpr3 = V_ADD_F32_e32 $vgpr0, $vgpr1, implicit $mode, implicit $exec + ; should pair + $vgpr4 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec + $vgpr5 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec +... + +--- +name: vopd_mov_fixup_fail +tracksRegLiveness: true +body: | + bb.0: + ; SCHED-LABEL: name: vopd_mov_fixup_fail + ; SCHED: $vgpr0 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec + ; SCHED-NEXT: $vgpr1 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds + 4, implicit $exec + ; PAIR-LABEL: name: vopd_mov_fixup_fail + ; PAIR: $vgpr0 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec + ; PAIR-NEXT: $vgpr1 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds + 4, implicit $exec + $vgpr0 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec + $vgpr1 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds + 4, implicit $exec +... diff --git a/llvm/test/CodeGen/AMDGPU/wqm-gfx11.ll b/llvm/test/CodeGen/AMDGPU/wqm-gfx11.ll --- a/llvm/test/CodeGen/AMDGPU/wqm-gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm-gfx11.ll @@ -12,10 +12,8 @@ ; CHECK-NEXT: lds_param_load v4, attr0.y wait_vdst:15 ; CHECK-NEXT: lds_param_load v5, attr0.z wait_vdst:15 ; CHECK-NEXT: s_mov_b32 exec_lo, s0 -; CHECK-NEXT: s_waitcnt expcnt(2) -; CHECK-NEXT: v_add_f32_e32 v0, v3, v0 ; CHECK-NEXT: s_waitcnt expcnt(1) -; CHECK-NEXT: v_add_f32_e32 v1, v4, v1 +; CHECK-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_add_f32 v1, v4, v1 ; CHECK-NEXT: s_waitcnt expcnt(0) ; CHECK-NEXT: v_add_f32_e32 v2, v5, v2 ; CHECK-NEXT: ; return to shader part epilog @@ -43,10 +41,8 @@ ; CHECK-NEXT: s_mov_b32 m0, s2 ; CHECK-NEXT: lds_direct_load v5 wait_vdst:15 ; CHECK-NEXT: s_mov_b32 exec_lo, s0 -; CHECK-NEXT: s_waitcnt expcnt(2) -; CHECK-NEXT: v_add_f32_e32 v0, v3, v0 ; CHECK-NEXT: s_waitcnt expcnt(1) -; CHECK-NEXT: v_add_f32_e32 v1, v4, v1 +; CHECK-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_add_f32 v1, v4, v1 ; CHECK-NEXT: s_waitcnt expcnt(0) ; CHECK-NEXT: v_add_f32_e32 v2, v5, v2 ; CHECK-NEXT: ; return to shader part epilog