diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h
--- a/llvm/lib/Target/AMDGPU/AMDGPU.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPU.h
@@ -317,6 +317,9 @@
 void initializeSIPostRABundlerPass(PassRegistry&);
 extern char &SIPostRABundlerID;
 
+void initializeGCNCreateVOPDPass(PassRegistry &);
+extern char &GCNCreateVOPDID;
+
 void initializeAMDGPUUnifyDivergentExitNodesPass(PassRegistry&);
 extern char &AMDGPUUnifyDivergentExitNodesID;
 
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -22,11 +22,13 @@
 #include "AMDGPUTargetTransformInfo.h"
 #include "GCNIterativeScheduler.h"
 #include "GCNSchedStrategy.h"
+#include "GCNVOPDUtils.h"
 #include "R600.h"
 #include "R600TargetMachine.h"
 #include "SIMachineFunctionInfo.h"
 #include "SIMachineScheduler.h"
 #include "TargetInfo/AMDGPUTargetInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
 #include "llvm/Analysis/CGSCCPassManager.h"
 #include "llvm/CodeGen/GlobalISel/CSEInfo.h"
 #include "llvm/CodeGen/GlobalISel/IRTranslator.h"
@@ -278,6 +280,12 @@
   cl::desc("Enable s_delay_alu insertion"),
   cl::init(true), cl::Hidden);
 
+// Enable GFX11+ VOPD
+static cl::opt<bool>
+    EnableVOPD("amdgpu-enable-vopd",
+               cl::desc("Enable VOPD, dual issue of VALU in wave32"),
+               cl::init(true), cl::Hidden);
+
 // Option is used in lit tests to prevent deadcoding of patterns inspected.
 static cl::opt<bool>
 EnableDCEInRA("amdgpu-dce-in-ra",
@@ -383,6 +391,7 @@
   initializeSIPreAllocateWWMRegsPass(*PR);
   initializeSIFormMemoryClausesPass(*PR);
   initializeSIPostRABundlerPass(*PR);
+  initializeGCNCreateVOPDPass(*PR);
   initializeAMDGPUUnifyDivergentExitNodesPass(*PR);
   initializeAMDGPUAAWrapperPassPass(*PR);
   initializeAMDGPUExternalAAWrapperPass(*PR);
@@ -909,6 +918,8 @@
   DAG->addMutation(ST.createFillMFMAShadowMutation(DAG->TII));
   DAG->addMutation(createIGroupLPDAGMutation());
   DAG->addMutation(createSchedBarrierDAGMutation());
+  if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
+    DAG->addMutation(createVOPDPairingMutation());
   return DAG;
 }
 
@@ -1385,6 +1396,8 @@
 }
 
 void GCNPassConfig::addPreEmitPass() {
+  if (isPassEnabled(EnableVOPD, CodeGenOpt::Less))
+    addPass(&GCNCreateVOPDID);
   addPass(createSIMemoryLegalizerPass());
   addPass(createSIInsertWaitcntsPass());
 
diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt
--- a/llvm/lib/Target/AMDGPU/CMakeLists.txt
+++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt
@@ -100,6 +100,7 @@
   AMDGPUUnifyDivergentExitNodes.cpp
   AMDGPUUnifyMetadata.cpp
   R600MachineCFGStructurizer.cpp
+  GCNCreateVOPD.cpp
   GCNDPPCombine.cpp
   GCNHazardRecognizer.cpp
   GCNILPSched.cpp
@@ -109,6 +110,7 @@
   GCNPreRAOptimizations.cpp
   GCNRegPressure.cpp
   GCNSchedStrategy.cpp
+  GCNVOPDUtils.cpp
   R600AsmPrinter.cpp
   R600ClauseMergePass.cpp
   R600ControlFlowFinalizer.cpp
diff --git a/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNCreateVOPD.cpp
@@ -0,0 +1,182 @@
+//===- GCNCreateVOPD.cpp - Create VOPD Instructions ----------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file
+/// Combine VALU pairs into VOPD instructions
+/// Only works on wave32
+/// Has register requirements, we reject creating VOPD if the requirements are
+/// not met.
+/// VOPDPairingMutation in postRA machine scheduler puts candidate
+/// instructions for VOPD back-to-back
+///
+//
+//===----------------------------------------------------------------------===//
+
+#include "AMDGPU.h"
+#include "GCNSubtarget.h"
+#include "GCNVOPDUtils.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "Utils/AMDGPUBaseInfo.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/ADT/Statistic.h"
+#include "llvm/ADT/StringMap.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/Support/Casting.h"
+#include "llvm/Support/Debug.h"
+#include <utility>
+
+#define DEBUG_TYPE "gcn-create-vopd"
+STATISTIC(NumVOPDCreated, "Number of VOPD Insts Created.");
+
+using namespace llvm;
+
+namespace {
+
+/// Machine function pass that fuses adjacent, pairable VALU instructions
+/// into single VOPD instructions.
+/// Does nothing unless the subtarget has VOPD and is in wave32 mode.
+class GCNCreateVOPD : public MachineFunctionPass {
+public:
+  static char ID;
+  const GCNSubtarget *ST = nullptr;
+
+  GCNCreateVOPD() : MachineFunctionPass(ID) {}
+
+  void getAnalysisUsage(AnalysisUsage &AU) const override {
+    AU.setPreservesCFG();
+    MachineFunctionPass::getAnalysisUsage(AU);
+  }
+
+  StringRef getPassName() const override {
+    return "GCN Create VOPD Instructions";
+  }
+
+  /// Replace the two instructions in \p Pair with one VOPD instruction.
+  /// Pair.first supplies the X component and Pair.second the Y component;
+  /// both originals are erased. Always returns true.
+  bool doReplace(const SIInstrInfo *SII,
+                 std::pair<MachineInstr *, MachineInstr *> &Pair) {
+    MachineInstr *FirstMI = Pair.first;
+    MachineInstr *SecondMI = Pair.second;
+    unsigned Opc1 = FirstMI->getOpcode();
+    unsigned Opc2 = SecondMI->getOpcode();
+    int NewOpcode = AMDGPU::getVOPDFull(AMDGPU::getVOPDOpcode(Opc1),
+                                        AMDGPU::getVOPDOpcode(Opc2));
+    assert(NewOpcode != -1 &&
+           "Should have previously determined this as a possible VOPD\n");
+
+    auto VOPDInst = BuildMI(*FirstMI->getParent(), FirstMI,
+                            FirstMI->getDebugLoc(), SII->get(NewOpcode))
+                        .setMIFlags(FirstMI->getFlags() | SecondMI->getFlags());
+    VOPDInst.add(FirstMI->getOperand(0)).add(SecondMI->getOperand(0));
+
+    // Append one component's explicit sources: operand 1 always, plus
+    // operands 2 and 3 for V_FMAMK/V_FMAAK, nothing more for V_MOV, and
+    // operand 2 for every other opcode.
+    auto AddSrcOperands = [&VOPDInst](const MachineInstr &MI) {
+      VOPDInst.add(MI.getOperand(1));
+      switch (MI.getOpcode()) {
+      case AMDGPU::V_MOV_B32_e32:
+        break;
+      case AMDGPU::V_FMAMK_F32:
+      case AMDGPU::V_FMAAK_F32:
+        VOPDInst.add(MI.getOperand(2));
+        VOPDInst.add(MI.getOperand(3));
+        break;
+      default:
+        VOPDInst.add(MI.getOperand(2));
+        break;
+      }
+    };
+    AddSrcOperands(*FirstMI);
+    AddSrcOperands(*SecondMI);
+
+    VOPDInst.copyImplicitOps(*FirstMI);
+    VOPDInst.copyImplicitOps(*SecondMI);
+
+    LLVM_DEBUG(dbgs() << "VOPD Fused: "; VOPDInst->dump();
+               dbgs() << " from\tX: "; Pair.first->dump(); dbgs() << "\tY: ";
+               Pair.second->dump(); dbgs() << "\n");
+    FirstMI->eraseFromParent();
+    SecondMI->eraseFromParent();
+    ++NumVOPDCreated;
+    return true;
+  }
+
+  /// Collect neighbouring instructions (debug instructions are skipped over)
+  /// that can be combined and replace every collected pair with a VOPD.
+  /// Returns false without touching \p MF unless the subtarget has VOPD and
+  /// is in wave32 mode; otherwise returns true iff a pair was replaced.
+  bool runOnMachineFunction(MachineFunction &MF) override {
+    if (skipFunction(MF.getFunction()))
+      return false;
+    ST = &MF.getSubtarget<GCNSubtarget>();
+    if (!AMDGPU::hasVOPD(*ST) || !ST->isWave32())
+      return false;
+    LLVM_DEBUG(dbgs() << "CreateVOPD Pass:\n");
+
+    const SIInstrInfo *SII = ST->getInstrInfo();
+    bool Changed = false;
+
+    // Pairs are only recorded during the scan and fused afterwards, so no
+    // instruction is erased while its block is still being iterated.
+    SmallVector<std::pair<MachineInstr *, MachineInstr *>> ReplaceCandidates;
+
+    for (auto &MBB : MF) {
+      auto MII = MBB.begin(), E = MBB.end();
+      while (MII != E) {
+        // Consider FirstMI together with the next non-debug instruction.
+        auto *FirstMI = &*MII;
+        MII = next_nodbg(MII, E);
+        if (MII == E)
+          break;
+        if (FirstMI->isDebugInstr())
+          continue;
+        auto *SecondMI = &*MII;
+        unsigned Opc = FirstMI->getOpcode();
+        unsigned Opc2 = SecondMI->getOpcode();
+        auto FirstCanBeVOPD = AMDGPU::getCanBeVOPD(Opc);
+        auto SecondCanBeVOPD = AMDGPU::getCanBeVOPD(Opc2);
+        std::pair<MachineInstr *, MachineInstr *> Pair;
+
+        // Pair is ordered {X component, Y component}.
+        if (FirstCanBeVOPD.X && SecondCanBeVOPD.Y)
+          Pair = {FirstMI, SecondMI};
+        else if (FirstCanBeVOPD.Y && SecondCanBeVOPD.X)
+          Pair = {SecondMI, FirstMI};
+        else
+          continue;
+        // checkVOPDRegConstraints cares about program order, but doReplace
+        // cares about X-Y order in the constituted VOPD
+        if (llvm::checkVOPDRegConstraints(*SII, *FirstMI, *SecondMI)) {
+          ReplaceCandidates.push_back(Pair);
+          // SecondMI is part of this pair now; continue the scan after it.
+          ++MII;
+        }
+      }
+    }
+    // Fuse the recorded pairs.
+    for (auto &Pair : ReplaceCandidates) {
+      Changed |= doReplace(SII, Pair);
+    }
+
+    return Changed;
+  }
+};
+
+} // namespace
+
+char GCNCreateVOPD::ID = 0;
+
+char &llvm::GCNCreateVOPDID = GCNCreateVOPD::ID;
+
+INITIALIZE_PASS(GCNCreateVOPD, DEBUG_TYPE, "GCN Create VOPD Instructions",
+                false, false)
diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.h
@@ -0,0 +1,32 @@
+//===- GCNVOPDUtils.h - GCN VOPD Utils ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the AMDGPU DAG scheduling
+/// mutation to pair VOPD instructions back to back.
It also contains
+/// subroutines useful in the creation of VOPD instructions
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIB_TARGET_AMDGPU_GCNVOPDUTILS_H
+#define LLVM_LIB_TARGET_AMDGPU_GCNVOPDUTILS_H
+
+#include "llvm/CodeGen/MachineScheduler.h"
+
+namespace llvm {
+class SIInstrInfo;
+
+/// \returns true if FirstMI and SecondMI (in program order) may form a VOPD.
+bool checkVOPDRegConstraints(const SIInstrInfo &TII,
+                             const MachineInstr &FirstMI,
+                             const MachineInstr &SecondMI);
+
+/// Creates the DAG mutation that places VOPD candidates back to back.
+std::unique_ptr<ScheduleDAGMutation> createVOPDPairingMutation();
+} // namespace llvm
+
+#endif // LLVM_LIB_TARGET_AMDGPU_GCNVOPDUTILS_H
diff --git a/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
new file mode 100644
--- /dev/null
+++ b/llvm/lib/Target/AMDGPU/GCNVOPDUtils.cpp
@@ -0,0 +1,212 @@
+//===- GCNVOPDUtils.cpp - GCN VOPD Utils ------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+/// \file This file contains the AMDGPU DAG scheduling
+/// mutation to pair VOPD instructions back to back.
It also contains
+/// subroutines useful in the creation of VOPD instructions
+//
+//===----------------------------------------------------------------------===//
+
+#include "GCNVOPDUtils.h"
+#include "AMDGPUSubtarget.h"
+#include "GCNSubtarget.h"
+#include "MCTargetDesc/AMDGPUMCTargetDesc.h"
+#include "SIInstrInfo.h"
+#include "llvm/ADT/STLExtras.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/CodeGen/MachineBasicBlock.h"
+#include "llvm/CodeGen/MachineInstr.h"
+#include "llvm/CodeGen/MachineOperand.h"
+#include "llvm/CodeGen/MachineRegisterInfo.h"
+#include "llvm/CodeGen/MacroFusion.h"
+#include "llvm/CodeGen/ScheduleDAG.h"
+#include "llvm/CodeGen/ScheduleDAGMutation.h"
+#include "llvm/CodeGen/TargetInstrInfo.h"
+#include "llvm/MC/MCInst.h"
+
+using namespace llvm;
+
+#define DEBUG_TYPE "gcn-vopd-utils"
+
+/// Check whether \p FirstMI and \p SecondMI, given in program order, can be
+/// combined: SecondMI must not read a register FirstMI modifies, and the
+/// literal, scalar register and register bank checks below must all pass.
+bool llvm::checkVOPDRegConstraints(const SIInstrInfo &TII,
+                                   const MachineInstr &FirstMI,
+                                   const MachineInstr &SecondMI) {
+  const MachineFunction *MF = FirstMI.getMF();
+  const GCNSubtarget &ST = MF->getSubtarget<GCNSubtarget>();
+  const SIRegisterInfo *TRI = ST.getRegisterInfo();
+  const MachineRegisterInfo &MRI = MF->getRegInfo();
+  const unsigned NumVGPRBanks = 4;
+  // Literals also count against scalar bus limit
+  SmallVector<const MachineOperand *> UniqueLiterals;
+  auto addLiteral = [&](const MachineOperand &Op) {
+    for (const MachineOperand *Literal : UniqueLiterals)
+      if (Literal->isIdenticalTo(Op))
+        return;
+    UniqueLiterals.push_back(&Op);
+  };
+  SmallVector<Register> UniqueScalarRegs;
+  auto addScalarReg = [&](Register Reg) {
+    if (!is_contained(UniqueScalarRegs, Reg))
+      UniqueScalarRegs.push_back(Reg);
+  };
+  assert([&]() -> bool {
+    for (auto MII = MachineBasicBlock::const_iterator(&FirstMI);
+         MII != FirstMI.getParent()->instr_end(); ++MII)
+      if (&*MII == &SecondMI)
+        return true;
+    return false;
+  }() && "Expected FirstMI to precede SecondMI");
+  // Cannot pair dependent instructions
+  for (const auto &Use : SecondMI.uses())
+    if (Use.isReg() && FirstMI.modifiesRegister(Use.getReg()))
+      return false;
+
+  struct ComponentInfo {
+    ComponentInfo(const MachineInstr &MI) : MI(MI) {}
+    Register Dst, Reg0, Reg1, Reg2;
+    const MachineInstr &MI;
+  };
+  ComponentInfo CInfo[] = {ComponentInfo(FirstMI), ComponentInfo(SecondMI)};
+
+  for (ComponentInfo &Comp : CInfo) {
+    switch (Comp.MI.getOpcode()) {
+    case AMDGPU::V_FMAMK_F32:
+      // cannot inline the fixed literal in fmamk
+      addLiteral(Comp.MI.getOperand(2));
+      Comp.Reg2 = Comp.MI.getOperand(3).getReg();
+      break;
+    case AMDGPU::V_FMAAK_F32:
+      // cannot inline the fixed literal in fmaak
+      addLiteral(Comp.MI.getOperand(3));
+      Comp.Reg1 = Comp.MI.getOperand(2).getReg();
+      break;
+    case AMDGPU::V_FMAC_F32_e32:
+    case AMDGPU::V_DOT2_F32_F16:
+    case AMDGPU::V_DOT2_F32_BF16:
+      Comp.Reg1 = Comp.MI.getOperand(2).getReg();
+      Comp.Reg2 = Comp.MI.getOperand(0).getReg();
+      break;
+    case AMDGPU::V_CNDMASK_B32_e32:
+      // Both components may add VCC_LO; keep the list unique, count it once.
+      addScalarReg(AMDGPU::VCC_LO);
+      Comp.Reg1 = Comp.MI.getOperand(2).getReg();
+      break;
+    case AMDGPU::V_MOV_B32_e32:
+      break;
+    default:
+      Comp.Reg1 = Comp.MI.getOperand(2).getReg();
+      break;
+    }
+
+    Comp.Dst = Comp.MI.getOperand(0).getReg();
+    const MachineOperand &Op0 = Comp.MI.getOperand(1);
+    if (Op0.isReg()) {
+      if (TRI->isVectorRegister(MRI, Op0.getReg()))
+        Comp.Reg0 = Op0.getReg();
+      else
+        addScalarReg(Op0.getReg());
+    } else if (!TII.isInlineConstant(Comp.MI, 1)) {
+      addLiteral(Op0);
+    }
+  }
+
+  if (UniqueLiterals.size() > 1)
+    return false;
+  if ((UniqueLiterals.size() + UniqueScalarRegs.size()) > 2)
+    return false;
+
+  // check port 0
+  if (CInfo[0].Reg0 && CInfo[1].Reg0 &&
+      CInfo[0].Reg0 % NumVGPRBanks == CInfo[1].Reg0 % NumVGPRBanks)
+    return false;
+  // check port 1
+  if (CInfo[0].Reg1 && CInfo[1].Reg1 &&
+      CInfo[0].Reg1 % NumVGPRBanks == CInfo[1].Reg1 % NumVGPRBanks)
+    return false;
+  // check port 2
+  if (CInfo[0].Reg2 && CInfo[1].Reg2 &&
+      !((CInfo[0].Reg2 ^ CInfo[1].Reg2) & 0x1))
+    return false;
+  if (!((CInfo[0].Dst ^ CInfo[1].Dst) & 0x1))
+    return false;
+
+  LLVM_DEBUG(dbgs() << "VOPD Reg Constraints Passed\n\tX: "; FirstMI.dump();
+             dbgs() << "\n\tY: "; SecondMI.dump(); dbgs() << "\n");
+  return true;
+}
+
+/// Check if the instr pair, FirstMI and SecondMI, should be scheduled
+/// together. Given SecondMI, when FirstMI is unspecified, then check if
+/// SecondMI may be part of a fused pair at all.
+static bool shouldScheduleVOPDAdjacent(const TargetInstrInfo &TII,
+                                       const TargetSubtargetInfo &TSI,
+                                       const MachineInstr *FirstMI,
+                                       const MachineInstr &SecondMI) {
+  const SIInstrInfo &STII = static_cast<const SIInstrInfo &>(TII);
+  auto SecondCanBeVOPD = AMDGPU::getCanBeVOPD(SecondMI.getOpcode());
+
+  // One instruction case
+  if (!FirstMI)
+    return SecondCanBeVOPD.Y;
+
+  auto FirstCanBeVOPD = AMDGPU::getCanBeVOPD(FirstMI->getOpcode());
+  if (!((FirstCanBeVOPD.X && SecondCanBeVOPD.Y) ||
+        (FirstCanBeVOPD.Y && SecondCanBeVOPD.X)))
+    return false;
+
+  return checkVOPDRegConstraints(STII, *FirstMI, SecondMI);
+}
+
+/// Adapts design from MacroFusion
+/// Puts valid candidate instructions back-to-back so they can easily
+/// be turned into VOPD instructions
+/// Greedily pairs instruction candidates. O(n^2) algorithm.
+struct VOPDPairingMutation : ScheduleDAGMutation {
+  ShouldSchedulePredTy shouldScheduleAdjacent; // NOLINT: function pointer
+
+  VOPDPairingMutation(
+      ShouldSchedulePredTy shouldScheduleAdjacent) // NOLINT: function pointer
+      : shouldScheduleAdjacent(shouldScheduleAdjacent) {}
+
+  /// Fuse each pairable unit with the first later unit it can be fused with.
+  void apply(ScheduleDAGInstrs *DAG) override {
+    const TargetInstrInfo &TII = *DAG->TII;
+    const GCNSubtarget &ST = DAG->MF.getSubtarget<GCNSubtarget>();
+    if (!AMDGPU::hasVOPD(ST) || !ST.isWave32()) {
+      LLVM_DEBUG(dbgs() << "Target does not support VOPDPairingMutation\n");
+      return;
+    }
+
+    for (auto ISUI = DAG->SUnits.begin(); ISUI != DAG->SUnits.end(); ++ISUI) {
+      const MachineInstr *IMI = ISUI->getInstr();
+      if (!shouldScheduleAdjacent(TII, ST, nullptr, *IMI) ||
+          !hasLessThanNumFused(*ISUI, 2))
+        continue;
+
+      for (auto JSUI = ISUI + 1; JSUI != DAG->SUnits.end(); ++JSUI) {
+        if (JSUI->isBoundaryNode())
+          continue;
+        const MachineInstr *JMI = JSUI->getInstr();
+        if (!hasLessThanNumFused(*JSUI, 2) ||
+            !shouldScheduleAdjacent(TII, ST, IMI, *JMI))
+          continue;
+        if (fuseInstructionPair(*DAG, *ISUI, *JSUI))
+          break;
+      }
+    }
+    LLVM_DEBUG(dbgs() << "Completed VOPDPairingMutation\n");
+  }
+};
+
+/// Create the pairing mutation driven by shouldScheduleVOPDAdjacent.
+std::unique_ptr<ScheduleDAGMutation> llvm::createVOPDPairingMutation() {
+  return std::make_unique<VOPDPairingMutation>(shouldScheduleVOPDAdjacent);
+}
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -3974,6 +3974,14 @@
   int Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0);
   int Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src1);
   int Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src2);
+  int Src3Idx = -1;
+  if (Src0Idx == -1) {
+    // VOPD V_DUAL_* instructions use different operand names.
+    Src0Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0X);
+    Src1Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1X);
+    Src2Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::src0Y);
+    Src3Idx = AMDGPU::getNamedOperandIdx(Opcode, AMDGPU::OpName::vsrc1Y);
+  }
 
   // Make sure the number of operands is correct.
   const MCInstrDesc &Desc = get(Opcode);
@@ -4247,9 +4255,9 @@
     // Only look at the true operands. Only a real operand can use the constant
     // bus, and we don't want to check pseudo-operands like the source modifier
     // flags.
-    for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx}) {
+    for (int OpIdx : {Src0Idx, Src1Idx, Src2Idx, Src3Idx}) {
       if (OpIdx == -1)
-        break;
+        continue;
       const MachineOperand &MO = MI.getOperand(OpIdx);
       if (usesConstantBus(MRI, MO, MI.getDesc().OpInfo[OpIdx])) {
         if (MO.isReg()) {
diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td
@@ -2948,6 +2948,27 @@
   let ValueCols = [["1"]];
 }
 
+def VOPDComponentTable : GenericTable {
+  let FilterClass = "VOPD_Component";
+  let CppTypeName = "VOPDComponentInfo";
+  let Fields = ["BaseVOP", "VOPDOp", "CanBeVOPDX"];
+  let PrimaryKey = ["BaseVOP"];
+  let PrimaryKeyName = "getVOPDComponentHelper";
+}
+
+def VOPDPairs : GenericTable {
+  let FilterClass = "VOPD_Base";
+  let CppTypeName = "VOPDInfo";
+  let Fields = ["Opcode", "OpX", "OpY"];
+  let PrimaryKey = ["Opcode"];
+  let PrimaryKeyName = "getVOPDOpcodeHelper";
+}
+
+def getVOPDInfoFromComponentOpcodes : SearchIndex {
+  let Table = VOPDPairs;
+  let Key = ["OpX", "OpY"];
+}
+
 include "SIInstructions.td"
 
 include "DSInstructions.td"
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.h
@@ -465,6 +465,14 @@
 LLVM_READONLY
 bool getMAIIsGFX940XDL(unsigned Opc);
 
+struct CanBeVOPD {
+  bool X;
+  bool Y;
+};
+
+LLVM_READONLY
+CanBeVOPD getCanBeVOPD(unsigned Opc);
+
 LLVM_READONLY
 const GcnBufferFormatInfo *getGcnBufferFormatInfo(uint8_t BitsPerComp,
                                                   uint8_t NumComponents,
@@ -477,6 +485,12 @@
 LLVM_READONLY
 int getMCOpcode(uint16_t Opcode, unsigned Gen);
 
+LLVM_READONLY
+unsigned getVOPDOpcode(unsigned Opc);
+
+LLVM_READONLY
+int getVOPDFull(unsigned OpX, unsigned OpY);
+
 void initDefaultAMDKernelCodeT(amd_kernel_code_t &Header,
                                const MCSubtargetInfo *STI);
 
diff --git a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
--- a/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/Utils/AMDGPUBaseInfo.cpp
@@ -277,6 +277,18 @@
   uint16_t Opcode;
 };
 
+struct VOPDComponentInfo {
+  uint16_t BaseVOP;
+  uint16_t VOPDOp;
+  bool CanBeVOPDX;
+};
+
+struct VOPDInfo {
+  uint16_t Opcode;
+  uint16_t OpX;
+  uint16_t OpY;
+};
+
 #define GET_MTBUFInfoTable_DECL
 #define GET_MTBUFInfoTable_IMPL
 #define GET_MUBUFInfoTable_DECL
@@ -293,6 +305,10 @@
 #define GET_VOPC64DPPTable_IMPL
 #define GET_VOPC64DPP8Table_DECL
 #define GET_VOPC64DPP8Table_IMPL
+#define GET_VOPDComponentTable_DECL
+#define GET_VOPDComponentTable_IMPL
+#define GET_VOPDPairs_DECL
+#define GET_VOPDPairs_IMPL
 #include "AMDGPUGenSearchableTables.inc"
 
 int getMTBUFBaseOpcode(unsigned Opc) {
@@ -394,6 +410,20 @@
   return Info ? Info->is_gfx940_xdl : false;
 }
 
+/// Report whether \p Opc may be the X and/or the Y component of a VOPD.
+CanBeVOPD getCanBeVOPD(unsigned Opc) {
+  // Every opcode found in the component table can be Y; X depends on the
+  // table's CanBeVOPDX field. Opcodes not in the table can be neither.
+  if (const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc))
+    return {Info->CanBeVOPDX, true};
+  return {false, false};
+}
+
+/// Return the VOPDOp table entry for \p Opc, or ~0u if \p Opc has none.
+unsigned getVOPDOpcode(unsigned Opc) {
+  const VOPDComponentInfo *Info = getVOPDComponentHelper(Opc);
+  return Info ? Info->VOPDOp : ~0u;
+}
+
 // Wrapper for Tablegen'd function. enum Subtarget is not defined in any
 // header files, so we need to wrap it in a function that takes unsigned
 // instead.
@@ -401,6 +431,12 @@
   return getMCOpcodeGen(Opcode, static_cast<Subtarget>(Gen));
 }
 
+/// Return the VOPD opcode whose components are \p OpX and \p OpY, or -1.
+int getVOPDFull(unsigned OpX, unsigned OpY) {
+  const VOPDInfo *Info = getVOPDInfoFromComponentOpcodes(OpX, OpY);
+  return Info ? Info->Opcode : -1;
+}
+
 namespace IsaInfo {
 
 AMDGPUTargetID::AMDGPUTargetID(const MCSubtargetInfo &STI)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.gather4.dim.ll
@@ -1,7 +1,7 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=tahiti -o - %s | FileCheck -check-prefix=GFX6 %s
 ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -o - %s | FileCheck -check-prefix=GFX10NSA %s
-; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -o - %s | FileCheck -check-prefix=GFX10NSA %s
+; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -o - %s | FileCheck -check-prefix=GFX10NSA %s
 
 define amdgpu_ps <4 x float> @gather4_2d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, float %s, float %t) {
 ; GFX6-LABEL: gather4_2d:
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2d.ll
@@ -94,14 +94,14 @@
 ; GFX11-LABEL: load_2d_v4f32_xyzw_tfe:
 ; GFX11: ; %bb.0:
 ; GFX11-NEXT: v_mov_b32_e32 v7, 0
-; GFX11-NEXT: v_mov_b32_e32 v5, v0
-; GFX11-NEXT: v_mov_b32_e32 v6, v1
+; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1
 ; GFX11-NEXT: s_mov_b32 s0, s2
 ; GFX11-NEXT: s_mov_b32 s1, s3
-; GFX11-NEXT: v_mov_b32_e32 v8, v7
+; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v9, v7 -; GFX11-NEXT: v_mov_b32_e32 v10, v7 ; GFX11-NEXT: v_mov_b32_e32 v11, v7 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 @@ -109,10 +109,8 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: v_mov_b32_e32 v0, v7 -; GFX11-NEXT: v_mov_b32_e32 v1, v8 -; GFX11-NEXT: v_mov_b32_e32 v2, v9 -; GFX11-NEXT: v_mov_b32_e32 v3, v10 -; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11 ; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v7, v4, s[10:11] @@ -184,14 +182,14 @@ ; GFX11-LABEL: load_2d_v4f32_xyzw_tfe_lwe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v7, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 -; GFX11-NEXT: v_mov_b32_e32 v8, v7 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v9, v7 -; GFX11-NEXT: v_mov_b32_e32 v10, v7 ; GFX11-NEXT: v_mov_b32_e32 v11, v7 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 @@ -199,10 +197,8 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: v_mov_b32_e32 v0, v7 -; GFX11-NEXT: v_mov_b32_e32 v1, v8 -; GFX11-NEXT: v_mov_b32_e32 v2, v9 -; GFX11-NEXT: v_mov_b32_e32 v3, v10 -; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11 ; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf 
dim:SQ_RSRC_IMG_2D unorm tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v7, v4, s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.a16.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX9 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX10 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10PLUS,GFX11 %s define amdgpu_ps <4 x float> @load_2darraymsaa_v4f32_xyzw(<8 x i32> inreg %rsrc, i16 %s, i16 %t, i16 %slice, i16 %fragid) { ; GFX9-LABEL: load_2darraymsaa_v4f32_xyzw: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.2darraymsaa.ll @@ -98,14 +98,13 @@ ; GFX11-LABEL: load_2darraymsaa_v4f32_xyzw_tfe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v9, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v7, v2 -; GFX11-NEXT: v_mov_b32_e32 v8, v3 -; GFX11-NEXT: v_mov_b32_e32 v10, v9 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 +; GFX11-NEXT: v_dual_mov_b32 
v7, v2 :: v_dual_mov_b32 v8, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v11, v9 -; GFX11-NEXT: v_mov_b32_e32 v12, v9 ; GFX11-NEXT: v_mov_b32_e32 v13, v9 +; GFX11-NEXT: v_mov_b32_e32 v12, v9 +; GFX11-NEXT: v_mov_b32_e32 v10, v9 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 @@ -115,10 +114,8 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: v_mov_b32_e32 v0, v9 -; GFX11-NEXT: v_mov_b32_e32 v1, v10 -; GFX11-NEXT: v_mov_b32_e32 v2, v11 -; GFX11-NEXT: v_mov_b32_e32 v3, v12 -; GFX11-NEXT: v_mov_b32_e32 v4, v13 +; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 +; GFX11-NEXT: v_dual_mov_b32 v1, v10 :: v_dual_mov_b32 v4, v13 ; GFX11-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v9, v4, s[10:11] @@ -194,14 +191,13 @@ ; GFX11-LABEL: load_2darraymsaa_v4f32_xyzw_tfe_lwe: ; GFX11: ; %bb.0: ; GFX11-NEXT: v_mov_b32_e32 v9, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v7, v2 -; GFX11-NEXT: v_mov_b32_e32 v8, v3 -; GFX11-NEXT: v_mov_b32_e32 v10, v9 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v8, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v11, v9 -; GFX11-NEXT: v_mov_b32_e32 v12, v9 ; GFX11-NEXT: v_mov_b32_e32 v13, v9 +; GFX11-NEXT: v_mov_b32_e32 v12, v9 +; GFX11-NEXT: v_mov_b32_e32 v10, v9 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 @@ -211,10 +207,8 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: v_mov_b32_e32 v0, v9 -; GFX11-NEXT: v_mov_b32_e32 v1, v10 -; GFX11-NEXT: v_mov_b32_e32 v2, v11 -; GFX11-NEXT: v_mov_b32_e32 v3, v12 -; GFX11-NEXT: v_mov_b32_e32 v4, v13 +; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: 
v_dual_mov_b32 v3, v12 +; GFX11-NEXT: v_dual_mov_b32 v1, v10 :: v_dual_mov_b32 v4, v13 ; GFX11-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v9, v4, s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.a16.ll @@ -113,12 +113,12 @@ ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_lshl_b32 s8, s0, 16 -; GFX11-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-NEXT: v_mov_b32_e32 v7, v5 -; GFX11-NEXT: v_mov_b32_e32 v8, v5 ; GFX11-NEXT: v_mov_b32_e32 v9, v5 +; GFX11-NEXT: v_mov_b32_e32 v8, v5 ; GFX11-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 ; GFX11-NEXT: v_and_or_b32 v11, 0xffff, v2, s8 ; GFX11-NEXT: s_mov_b32 s1, s3 @@ -126,10 +126,8 @@ ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: v_mov_b32_e32 v0, v5 -; GFX11-NEXT: v_mov_b32_e32 v1, v6 -; GFX11-NEXT: v_mov_b32_e32 v2, v7 -; GFX11-NEXT: v_mov_b32_e32 v3, v8 -; GFX11-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v1, v6 +; GFX11-NEXT: v_dual_mov_b32 v3, v8 :: v_dual_mov_b32 v4, v9 ; GFX11-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v5, v4, s[10:11] @@ -211,12 +209,12 @@ ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s4, s6 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_lshl_b32 s8, s0, 16 -; GFX11-NEXT: v_mov_b32_e32 v6, v5 ; GFX11-NEXT: v_mov_b32_e32 v7, v5 -; GFX11-NEXT: v_mov_b32_e32 v8, v5 ; GFX11-NEXT: 
v_mov_b32_e32 v9, v5 +; GFX11-NEXT: v_mov_b32_e32 v8, v5 ; GFX11-NEXT: v_and_or_b32 v10, 0xffff, v0, v1 ; GFX11-NEXT: v_and_or_b32 v11, 0xffff, v2, s8 ; GFX11-NEXT: s_mov_b32 s1, s3 @@ -224,10 +222,8 @@ ; GFX11-NEXT: s_mov_b32 s5, s7 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: v_mov_b32_e32 v0, v5 -; GFX11-NEXT: v_mov_b32_e32 v1, v6 -; GFX11-NEXT: v_mov_b32_e32 v2, v7 -; GFX11-NEXT: v_mov_b32_e32 v3, v8 -; GFX11-NEXT: v_mov_b32_e32 v4, v9 +; GFX11-NEXT: v_dual_mov_b32 v2, v7 :: v_dual_mov_b32 v1, v6 +; GFX11-NEXT: v_dual_mov_b32 v3, v8 :: v_dual_mov_b32 v4, v9 ; GFX11-NEXT: image_load v[0:4], v[10:11], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm a16 tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v5, v4, s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.load.3d.ll @@ -95,16 +95,15 @@ ; ; GFX11-LABEL: load_3d_v4f32_xyzw_tfe: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v7, v2 +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: s_mov_b32 s0, s2 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 -; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 -; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 @@ -112,10 +111,8 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: v_mov_b32_e32 v0, v8 -; GFX11-NEXT: v_mov_b32_e32 v1, v9 -; GFX11-NEXT: 
v_mov_b32_e32 v2, v10 -; GFX11-NEXT: v_mov_b32_e32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[10:11] @@ -188,16 +185,15 @@ ; ; GFX11-LABEL: load_3d_v4f32_xyzw_tfe_lwe: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v7, v2 +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: s_mov_b32 s0, s2 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 -; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 -; GFX11-NEXT: s_mov_b32 s1, s3 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: s_mov_b32 s2, s4 ; GFX11-NEXT: s_mov_b32 s3, s5 ; GFX11-NEXT: s_mov_b32 s4, s6 @@ -205,10 +201,8 @@ ; GFX11-NEXT: s_mov_b32 s6, s8 ; GFX11-NEXT: s_mov_b32 s7, s9 ; GFX11-NEXT: v_mov_b32_e32 v0, v8 -; GFX11-NEXT: v_mov_b32_e32 v1, v9 -; GFX11-NEXT: v_mov_b32_e32 v2, v10 -; GFX11-NEXT: v_mov_b32_e32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[10:11] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll +++ 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.sample.g16.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -global-isel -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_d_1d: diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.image.store.2d.ll @@ -622,8 +622,7 @@ ; ; GFX11-LABEL: image_store_f32_dmask_1111: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v1, s10 -; GFX11-NEXT: v_mov_b32_e32 v2, s11 +; GFX11-NEXT: v_dual_mov_b32 v1, s10 :: v_dual_mov_b32 v2, s11 ; GFX11-NEXT: s_mov_b32 s0, s2 ; GFX11-NEXT: s_mov_b32 s1, s3 ; GFX11-NEXT: s_mov_b32 s2, s4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.interp.inreg.ll @@ -42,9 +42,8 @@ ; GCN-NEXT: lds_param_load v2, attr2.x wait_vdst:15 ; GCN-NEXT: lds_param_load v3, attr3.x wait_vdst:15 ; GCN-NEXT: s_mov_b32 exec_lo, s3 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GCN-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 
+; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3 ; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2 ; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.intersect_ray.ll @@ -228,10 +228,8 @@ ; ; GFX11-LABEL: image_bvh_intersect_ray_vgpr_descr: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v18, v0 -; GFX11-NEXT: v_mov_b32_e32 v19, v1 -; GFX11-NEXT: v_mov_b32_e32 v15, v2 -; GFX11-NEXT: v_mov_b32_e32 v16, v3 +; GFX11-NEXT: v_dual_mov_b32 v18, v0 :: v_dual_mov_b32 v19, v1 +; GFX11-NEXT: v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v16, v3 ; GFX11-NEXT: v_mov_b32_e32 v17, v4 ; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB6_1: ; =>This Inner Loop Header: Depth=1 @@ -346,19 +344,18 @@ ; ; GFX11-LABEL: image_bvh_intersect_ray_a16_vgpr_descr: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v13, v0 +; GFX11-NEXT: v_dual_mov_b32 v13, v0 :: v_dual_mov_b32 v14, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v5 -; GFX11-NEXT: v_mov_b32_e32 v14, v1 -; GFX11-NEXT: v_mov_b32_e32 v15, v2 -; GFX11-NEXT: v_mov_b32_e32 v16, v3 +; GFX11-NEXT: v_dual_mov_b32 v15, v2 :: v_dual_mov_b32 v16, v3 +; GFX11-NEXT: v_dual_mov_b32 v17, v4 :: v_dual_lshlrev_b32 v2, 16, v5 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v7 -; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_3) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v6 -; GFX11-NEXT: v_mov_b32_e32 v17, v4 ; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: v_and_or_b32 v4, 0xffff, v7, v2 ; GFX11-NEXT: v_and_or_b32 v5, 0xffff, v1, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; 
GFX11-NEXT: v_and_or_b32 v6, 0xffff, v8, v3 ; GFX11-NEXT: .LBB7_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v9 @@ -470,12 +467,9 @@ ; ; GFX11-LABEL: image_bvh64_intersect_ray_vgpr_descr: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v19, v0 -; GFX11-NEXT: v_mov_b32_e32 v20, v1 -; GFX11-NEXT: v_mov_b32_e32 v21, v2 -; GFX11-NEXT: v_mov_b32_e32 v16, v3 -; GFX11-NEXT: v_mov_b32_e32 v17, v4 -; GFX11-NEXT: v_mov_b32_e32 v18, v5 +; GFX11-NEXT: v_dual_mov_b32 v19, v0 :: v_dual_mov_b32 v20, v1 +; GFX11-NEXT: v_dual_mov_b32 v21, v2 :: v_dual_mov_b32 v16, v3 +; GFX11-NEXT: v_dual_mov_b32 v17, v4 :: v_dual_mov_b32 v18, v5 ; GFX11-NEXT: s_mov_b32 s1, exec_lo ; GFX11-NEXT: .LBB8_1: ; =>This Inner Loop Header: Depth=1 ; GFX11-NEXT: v_readfirstlane_b32 s4, v12 @@ -595,18 +589,17 @@ ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16_vgpr_descr: ; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v14, v0 +; GFX11-NEXT: v_dual_mov_b32 v14, v0 :: v_dual_mov_b32 v15, v1 ; GFX11-NEXT: v_lshrrev_b32_e32 v0, 16, v6 -; GFX11-NEXT: v_mov_b32_e32 v15, v1 -; GFX11-NEXT: v_mov_b32_e32 v16, v2 -; GFX11-NEXT: v_mov_b32_e32 v17, v3 +; GFX11-NEXT: v_dual_mov_b32 v16, v2 :: v_dual_mov_b32 v17, v3 ; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v8 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 16, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(SKIP_2) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 16, v0 -; GFX11-NEXT: v_lshlrev_b32_e32 v3, 16, v7 -; GFX11-NEXT: v_mov_b32_e32 v18, v4 +; GFX11-NEXT: v_dual_mov_b32 v18, v4 :: v_dual_lshlrev_b32 v3, 16, v7 ; GFX11-NEXT: v_mov_b32_e32 v19, v5 ; GFX11-NEXT: v_and_or_b32 v4, 0xffff, v8, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_and_or_b32 v5, 0xffff, v1, v0 ; GFX11-NEXT: v_and_or_b32 v6, 0xffff, v9, v3 ; GFX11-NEXT: s_mov_b32 s1, exec_lo @@ -710,35 +703,31 @@ ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 ; 
GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 +; GFX11-NEXT: s_mov_b32 s8, 2.0 ; GFX11-NEXT: s_mov_b32 s9, 0x40400000 ; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 -; GFX11-NEXT: s_mov_b32 s8, 2.0 ; GFX11-NEXT: s_mov_b32 s11, 0x40a00000 ; GFX11-NEXT: s_mov_b32 s10, 4.0 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 ; GFX11-NEXT: v_mov_b32_e32 v6, s12 -; GFX11-NEXT: v_mov_b32_e32 v7, s13 -; GFX11-NEXT: v_mov_b32_e32 v8, s14 +; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v7, s13 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: v_mov_b32_e32 v1, s5 -; GFX11-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-NEXT: v_mov_b32_e32 v3, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo -; GFX11-NEXT: s_mov_b32 s7, 1.0 ; GFX11-NEXT: flat_load_b32 v9, v[0:1] ; GFX11-NEXT: flat_load_b32 v10, v[2:3] -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_mov_b32_e32 v3, s9 -; GFX11-NEXT: v_mov_b32_e32 v1, s7 -; GFX11-NEXT: v_mov_b32_e32 v2, s8 -; GFX11-NEXT: v_mov_b32_e32 v4, s10 -; GFX11-NEXT: v_mov_b32_e32 v5, s11 +; GFX11-NEXT: s_mov_b32 s7, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s9 +; GFX11-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v2, s8 +; GFX11-NEXT: v_dual_mov_b32 v5, s11 :: v_dual_mov_b32 v4, s10 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v9, v10, v[0:2], v[3:5], v[6:8]], s[0:3] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -865,29 +854,26 @@ ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v4, 2, v0 -; GFX11-NEXT: s_mov_b32 s9, 0x42004600 ; GFX11-NEXT: s_mov_b32 s8, 2.0 +; GFX11-NEXT: s_mov_b32 s9, 0x42004600 ; GFX11-NEXT: s_mov_b32 s10, 0x44004700 ; GFX11-NEXT: s_mov_b32 s11, 0x45004800 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s4 -; GFX11-NEXT: v_mov_b32_e32 v1, s5 -; GFX11-NEXT: v_mov_b32_e32 v2, s6 -; GFX11-NEXT: v_mov_b32_e32 v3, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX11-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 ; GFX11-NEXT: s_mov_b32 s6, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) ; GFX11-NEXT: v_add_co_u32 v2, vcc_lo, v2, v4 ; GFX11-NEXT: v_add_co_ci_u32_e32 v3, vcc_lo, 0, v3, vcc_lo ; GFX11-NEXT: s_mov_b32 s7, 1.0 ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_mov_b32_e32 v3, s9 -; GFX11-NEXT: v_mov_b32_e32 v1, s7 -; GFX11-NEXT: v_mov_b32_e32 v2, s8 -; GFX11-NEXT: v_mov_b32_e32 v4, s10 -; GFX11-NEXT: v_mov_b32_e32 v5, s11 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v3, s9 +; GFX11-NEXT: v_dual_mov_b32 v1, s7 :: v_dual_mov_b32 v2, s8 +; GFX11-NEXT: v_dual_mov_b32 v5, s11 :: v_dual_mov_b32 v4, s10 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[0:2], v[3:5]], s[0:3] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -978,34 +964,29 @@ ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 +; GFX11-NEXT: s_mov_b32 s8, 2.0 +; GFX11-NEXT: s_mov_b32 s7, 1.0 ; GFX11-NEXT: s_mov_b32 s6, 0 ; GFX11-NEXT: s_mov_b32 s9, 0x40400000 ; GFX11-NEXT: s_mov_b32 s12, 0x40c00000 -; GFX11-NEXT: s_mov_b32 s8, 2.0 -; 
GFX11-NEXT: s_mov_b32 s7, 1.0 ; GFX11-NEXT: s_mov_b32 s11, 0x40a00000 ; GFX11-NEXT: s_mov_b32 s10, 4.0 ; GFX11-NEXT: s_mov_b32 s14, 0x41000000 ; GFX11-NEXT: s_mov_b32 s13, 0x40e00000 -; GFX11-NEXT: v_mov_b32_e32 v3, s9 ; GFX11-NEXT: v_mov_b32_e32 v6, s12 -; GFX11-NEXT: v_mov_b32_e32 v4, s10 -; GFX11-NEXT: v_mov_b32_e32 v5, s11 -; GFX11-NEXT: v_mov_b32_e32 v7, s13 -; GFX11-NEXT: v_mov_b32_e32 v8, s14 +; GFX11-NEXT: v_dual_mov_b32 v8, s14 :: v_dual_mov_b32 v3, s9 +; GFX11-NEXT: v_dual_mov_b32 v4, s10 :: v_dual_mov_b32 v7, s13 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v5, s11 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-NEXT: s_mov_b32 s4, 0xb36211c7 ; GFX11-NEXT: s_movk_i32 s5, 0x102 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: v_mov_b32_e32 v10, s5 +; GFX11-NEXT: v_dual_mov_b32 v10, s5 :: v_dual_mov_b32 v9, s4 ; GFX11-NEXT: v_add_co_u32 v0, vcc_lo, v0, v2 ; GFX11-NEXT: v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo -; GFX11-NEXT: v_mov_b32_e32 v9, s4 ; GFX11-NEXT: flat_load_b32 v11, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX11-NEXT: v_mov_b32_e32 v2, s8 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[9:10], v11, v[0:2], v[3:5], v[6:8]], s[0:3] @@ -1131,11 +1112,9 @@ ; GFX11-NEXT: s_mov_b32 s7, 1.0 ; GFX11-NEXT: s_mov_b32 s10, 0x44004700 ; GFX11-NEXT: s_mov_b32 s11, 0x45004800 -; GFX11-NEXT: v_mov_b32_e32 v3, s9 -; GFX11-NEXT: v_mov_b32_e32 v4, s10 -; GFX11-NEXT: v_mov_b32_e32 v5, s11 +; GFX11-NEXT: v_dual_mov_b32 v3, s9 :: v_dual_mov_b32 v4, s10 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s4 +; GFX11-NEXT: v_dual_mov_b32 v5, s11 :: v_dual_mov_b32 v0, s4 ; GFX11-NEXT: v_mov_b32_e32 v1, s5 ; GFX11-NEXT: s_mov_b32 s4, 0xb36211c6 ; GFX11-NEXT: s_movk_i32 s5, 0x102 @@ -1145,8 +1124,7 @@ ; GFX11-NEXT: 
v_add_co_ci_u32_e32 v1, vcc_lo, 0, v1, vcc_lo ; GFX11-NEXT: v_mov_b32_e32 v6, s4 ; GFX11-NEXT: flat_load_b32 v8, v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, s6 -; GFX11-NEXT: v_mov_b32_e32 v1, s7 +; GFX11-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 ; GFX11-NEXT: v_mov_b32_e32 v2, s8 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh64_intersect_ray v[0:3], [v[6:7], v8, v[0:2], v[3:5]], s[0:3] a16 diff --git a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll --- a/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll +++ b/llvm/test/CodeGen/AMDGPU/atomic_optimizations_local_pointer.ll @@ -212,10 +212,9 @@ ; GFX1132-NEXT: s_cbranch_execz .LBB0_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_mul_i32 s3, s3, 5 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, s3 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 @@ -454,11 +453,10 @@ ; GFX1132-NEXT: s_cbranch_execz .LBB1_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s2, s0, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_add_rtn_u32 v1, v1, v2 @@ -1569,8 +1567,8 @@ ; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 ; GFX1132-NEXT: s_mul_i32 s5, s2, s5 ; GFX1132-NEXT: s_add_i32 s7, s7, s6 -; GFX1132-NEXT: v_mov_b32_e32 v0, s5 -; GFX1132-NEXT: v_mov_b32_e32 v1, 
s7 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_add_rtn_u64 v[0:1], v3, v[0:1] @@ -1878,10 +1876,9 @@ ; GFX1132-NEXT: s_cbranch_execz .LBB7_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s3, s3 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) ; GFX1132-NEXT: s_mul_i32 s3, s3, 5 -; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, s3 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s3 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 @@ -2124,11 +2121,10 @@ ; GFX1132-NEXT: s_cbranch_execz .LBB8_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: s_bcnt1_i32_b32 s2, s2 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 ; GFX1132-NEXT: s_waitcnt lgkmcnt(0) ; GFX1132-NEXT: s_mul_i32 s2, s0, s2 ; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX1132-NEXT: v_mov_b32_e32 v2, s2 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, s2 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_sub_rtn_u32 v1, v1, v2 @@ -3265,8 +3261,8 @@ ; GFX1132-NEXT: s_mul_hi_u32 s7, s2, s5 ; GFX1132-NEXT: s_mul_i32 s5, s2, s5 ; GFX1132-NEXT: s_add_i32 s7, s7, s6 -; GFX1132-NEXT: v_mov_b32_e32 v0, s5 -; GFX1132-NEXT: v_mov_b32_e32 v1, s7 +; GFX1132-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX1132-NEXT: v_dual_mov_b32 v0, s5 :: v_dual_mov_b32 v1, s7 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_sub_rtn_u64 v[0:1], v3, v[0:1] @@ -5037,8 +5033,7 @@ ; GFX1132-NEXT: s_cbranch_execz .LBB18_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: 
v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_max_rtn_i64 v[0:1], v2, v[0:1] @@ -5648,8 +5643,7 @@ ; GFX1132-NEXT: s_cbranch_execz .LBB20_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_min_rtn_i64 v[0:1], v2, v[0:1] @@ -6256,8 +6250,7 @@ ; GFX1132-NEXT: s_cbranch_execz .LBB22_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_max_rtn_u64 v[0:1], v2, v[0:1] @@ -6864,8 +6857,7 @@ ; GFX1132-NEXT: s_cbranch_execz .LBB24_2 ; GFX1132-NEXT: ; %bb.1: ; GFX1132-NEXT: v_mov_b32_e32 v0, 5 -; GFX1132-NEXT: v_mov_b32_e32 v1, 0 -; GFX1132-NEXT: v_mov_b32_e32 v2, 0 +; GFX1132-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v2, 0 ; GFX1132-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX1132-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX1132-NEXT: ds_min_rtn_u64 v[0:1], v2, v[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll b/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll --- a/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll +++ b/llvm/test/CodeGen/AMDGPU/dual-source-blend-export.ll @@ -8,17 +8,16 @@ ; GCN-NEXT: s_mov_b32 s1, exec_lo ; GCN-NEXT: s_wqm_b32 exec_lo, exec_lo ; GCN-NEXT: s_mov_b32 m0, s0 -; GCN-NEXT: v_mov_b32_e32 v2, v0 +; GCN-NEXT: v_mbcnt_lo_u32_b32 v7, -1, 0 ; GCN-NEXT: lds_param_load v3, attr1.x wait_vdst:15 ; GCN-NEXT: lds_param_load v4, attr1.y wait_vdst:15 ; GCN-NEXT: lds_param_load v5, attr1.z 
wait_vdst:15 ; GCN-NEXT: lds_param_load v6, attr1.w wait_vdst:15 -; GCN-NEXT: v_mbcnt_lo_u32_b32 v7, -1, 0 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GCN-NEXT: v_mbcnt_hi_u32_b32 v7, -1, v7 -; GCN-NEXT: v_and_b32_e32 v7, 1, v7 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_4) | instid1(VALU_DEP_4) +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GCN-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_and_b32 v7, 1, v7 ; GCN-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v7 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GCN-NEXT: v_interp_p10_f32 v8, v4, v2, v4 wait_exp:2 ; GCN-NEXT: v_interp_p10_f32 v10, v5, v2, v5 wait_exp:1 ; GCN-NEXT: v_interp_p10_f32 v9, v6, v2, v6 @@ -32,9 +31,8 @@ ; GCN-NEXT: v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6] ; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GCN-NEXT: v_mov_b32_dpp v6, v6 dpp8:[1,0,3,2,5,4,7,6] -; GCN-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc_lo -; GCN-NEXT: v_cndmask_b32_e32 v4, v5, v4, vcc_lo -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GCN-NEXT: v_dual_cndmask_b32 v3, v4, v5 :: v_dual_cndmask_b32 v4, v5, v4 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GCN-NEXT: v_cndmask_b32_e32 v5, v2, v6, vcc_lo ; GCN-NEXT: v_cndmask_b32_e32 v2, v6, v2, vcc_lo ; GCN-NEXT: v_mov_b32_dpp v4, v4 dpp8:[1,0,3,2,5,4,7,6] diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-svs.ll @@ -47,8 +47,7 @@ ; GFX11-SDAG-LABEL: soff1_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-SDAG-NEXT: 
v_mov_b32_e32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 @@ -64,13 +63,11 @@ ; GFX11-GISEL-LABEL: soff1_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 :: v_dual_add_nc_u32 v0, v1, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:2 dlc @@ -135,11 +132,10 @@ ; GFX11-SDAG-LABEL: soff1_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 @@ -154,13 +150,11 @@ ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: 
v_add_nc_u32_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 :: v_dual_add_nc_u32 v0, v1, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:2 dlc @@ -225,10 +219,8 @@ ; GFX11-SDAG-LABEL: soff1_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_add_i32 s0, s0, 4 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, s0 offset:1 dlc @@ -244,13 +236,11 @@ ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 ; GFX11-GISEL-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 :: v_dual_add_nc_u32 v0, v1, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:2 dlc @@ -315,8 +305,7 @@ ; GFX11-SDAG-LABEL: soff2_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 @@ -334,14 +323,12 @@ ; GFX11-GISEL-LABEL: soff2_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 :: v_dual_add_nc_u32 v0, v1, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:2 dlc @@ -408,13 +395,11 @@ ; GFX11-SDAG-LABEL: soff2_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v2, 2 :: v_dual_mov_b32 v3, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 -; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX11-SDAG-NEXT: s_delay_alu instid0(VALU_DEP_2) | instid1(SALU_CYCLE_1) ; GFX11-SDAG-NEXT: v_add3_u32 v0, 4, s0, v0 ; GFX11-SDAG-NEXT: scratch_store_b8 v0, v1, off offset:1 dlc ; GFX11-SDAG-NEXT: s_waitcnt_vscnt null, 0x0 @@ -429,14 +414,12 @@ ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; 
GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 :: v_dual_add_nc_u32 v0, v1, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:2 dlc @@ -503,10 +486,8 @@ ; GFX11-SDAG-LABEL: soff2_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -524,14 +505,12 @@ ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 1 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 :: v_dual_add_nc_u32 v0, v1, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:2 dlc @@ -596,9 +575,8 @@ ; 
GFX11-SDAG-LABEL: soff4_voff1: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v4, 4 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -616,14 +594,12 @@ ; GFX11-GISEL-LABEL: soff4_voff1: ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 :: v_dual_add_nc_u32 v0, v1, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:2 dlc @@ -690,8 +666,7 @@ ; GFX11-SDAG-LABEL: soff4_voff2: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_lshlrev_b32 v0, 1, v0 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 ; GFX11-SDAG-NEXT: v_mov_b32_e32 v4, 4 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) @@ -712,14 +687,12 @@ ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 :: v_dual_add_nc_u32 v0, v1, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:2 dlc @@ -786,10 +759,8 @@ ; GFX11-SDAG-LABEL: soff4_voff4: ; GFX11-SDAG: ; %bb.0: ; %bb ; GFX11-SDAG-NEXT: s_load_b32 s0, s[0:1], 0x24 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-SDAG-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v2, 2 -; GFX11-SDAG-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v3, 4 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-SDAG-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-SDAG-NEXT: s_delay_alu instid0(SALU_CYCLE_1) @@ -807,14 +778,12 @@ ; GFX11-GISEL: ; %bb.0: ; %bb ; GFX11-GISEL-NEXT: s_load_b32 s0, s[0:1], 0x24 ; GFX11-GISEL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v2, 1 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v3, 4 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v2, 1 :: v_dual_mov_b32 v3, 4 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-GISEL-NEXT: s_lshl_b32 s0, s0, 2 ; GFX11-GISEL-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-GISEL-NEXT: v_add_nc_u32_e64 v1, s0, 4 -; GFX11-GISEL-NEXT: v_add_nc_u32_e32 v0, v1, v0 -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 2 :: v_dual_add_nc_u32 v0, v1, v0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v2, off offset:1 dlc ; GFX11-GISEL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-GISEL-NEXT: scratch_store_b8 v0, v1, off offset:2 dlc diff --git 
a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch.ll @@ -58,10 +58,8 @@ ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:64 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:48 @@ -175,10 +173,8 @@ ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: s_clause 0x3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:64 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:48 @@ -239,10 +235,8 @@ ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:48 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32 @@ -313,10 +307,8 @@ ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-PAL-NEXT: 
v_mov_b32_e32 v1, s1 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: s_clause 0x3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:48 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:32 @@ -685,9 +677,8 @@ ; ; GFX11-LABEL: store_load_vindex_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 15 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 4, v0 ; GFX11-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -748,9 +739,8 @@ ; ; GFX11-PAL-LABEL: store_load_vindex_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 4, v0 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v1, off offset:4 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -815,10 +805,9 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 15 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-NEXT: scratch_store_b32 v0, v2, s32 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -871,10 +860,9 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 -; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-PAL-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 ; GFX11-PAL-NEXT: scratch_store_b32 v0, v2, s32 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1035,10 +1023,8 @@ ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], off offset:288 @@ -1164,10 +1150,8 @@ ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: s_clause 0x3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:272 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], off offset:288 @@ -1237,10 +1221,8 @@ ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 
v3, s3 ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272 @@ -1319,10 +1301,8 @@ ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: s_clause 0x3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:256 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], s32 offset:272 @@ -1764,8 +1744,7 @@ ; ; GFX11-LABEL: store_load_vindex_small_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 @@ -1859,8 +1838,7 @@ ; ; GFX11-PAL-LABEL: store_load_vindex_small_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: v_sub_nc_u32_e32 v2, 0x104, v0 @@ -1927,9 +1905,8 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_lshlrev_b32_e32 v1, 2, v1 
@@ -1993,9 +1970,8 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v1, 2, v1 @@ -2095,10 +2071,8 @@ ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo ; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4010 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 @@ -2234,10 +2208,8 @@ ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo ; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4010 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 @@ -2317,10 +2289,8 @@ ; GFX11-NEXT: s_mov_b32 s1, s0 ; GFX11-NEXT: s_mov_b32 s2, s0 ; GFX11-NEXT: s_mov_b32 s3, s0 -; GFX11-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, 
s3 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo ; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4010 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 @@ -2441,10 +2411,8 @@ ; GFX11-PAL-NEXT: s_mov_b32 s1, s0 ; GFX11-PAL-NEXT: s_mov_b32 s2, s0 ; GFX11-PAL-NEXT: s_mov_b32 s3, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, s1 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, s2 -; GFX11-PAL-NEXT: v_mov_b32_e32 v3, s3 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX11-PAL-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo ; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4010 ; GFX11-PAL-NEXT: scratch_store_b128 off, v[0:3], vcc_lo offset:16 @@ -2871,8 +2839,7 @@ ; ; GFX11-LABEL: store_load_vindex_large_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX11-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -2968,8 +2935,7 @@ ; ; GFX11-PAL-LABEL: store_load_vindex_large_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-PAL-NEXT: s_movk_i32 vcc_lo, 0x4004 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, off offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) @@ -3037,9 +3003,8 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX11-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc ; GFX11-NEXT: 
s_waitcnt vmcnt(0) @@ -3107,9 +3072,8 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_and_b32_e32 v1, 15, v0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v2, 15 :: v_dual_and_b32 v1, 15, v0 ; GFX11-PAL-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-PAL-NEXT: s_add_i32 vcc_lo, s32, 0x4004 ; GFX11-PAL-NEXT: scratch_load_b32 v3, off, s32 offset:4 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) @@ -3190,8 +3154,7 @@ ; ; GFX11-LABEL: store_load_large_imm_offset_kernel: ; GFX11: ; %bb.0: ; %bb -; GFX11-NEXT: v_mov_b32_e32 v0, 13 -; GFX11-NEXT: v_mov_b32_e32 v1, 0x3000 +; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 ; GFX11-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-NEXT: scratch_store_b32 off, v0, off offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3285,8 +3248,7 @@ ; ; GFX11-PAL-LABEL: store_load_large_imm_offset_kernel: ; GFX11-PAL: ; %bb.0: ; %bb -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 13 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x3000 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 ; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, off offset:4 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3344,8 +3306,7 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v0, 13 -; GFX11-NEXT: v_mov_b32_e32 v1, 0x3000 +; GFX11-NEXT: v_dual_mov_b32 v0, 13 :: v_dual_mov_b32 v1, 0x3000 ; GFX11-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3406,8 +3367,7 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 13 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x3000 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 13 :: 
v_dual_mov_b32 v1, 0x3000 ; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 15 ; GFX11-PAL-NEXT: scratch_store_b32 off, v0, s32 offset:4 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3601,8 +3561,7 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v1, 15 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc @@ -3647,8 +3606,7 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 ; GFX11-PAL-NEXT: scratch_store_b64 v0, v[1:2], off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc @@ -3698,8 +3656,7 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v1, 15 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 ; GFX11-NEXT: scratch_store_b64 v0, v[1:2], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc @@ -3744,8 +3701,7 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 15 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 0 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 15 :: v_dual_mov_b32 v2, 0 ; GFX11-PAL-NEXT: scratch_store_b64 v0, v[1:2], off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b64 v[0:1], v0, off glc dlc @@ -3797,8 +3753,7 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-NEXT: v_mov_b32_e32 v2, 2 +; GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-NEXT: v_mov_b32_e32 v3, 3 ; GFX11-NEXT: scratch_store_b96 v0, v[1:3], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3847,8 +3802,7 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 2 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 ; GFX11-PAL-NEXT: v_mov_b32_e32 v3, 3 ; GFX11-PAL-NEXT: scratch_store_b96 v0, v[1:3], off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3904,10 +3858,8 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-NEXT: v_mov_b32_e32 v2, 2 -; GFX11-NEXT: v_mov_b32_e32 v3, 3 -; GFX11-NEXT: v_mov_b32_e32 v4, 4 +; GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX11-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4 ; GFX11-NEXT: scratch_store_b128 v0, v[1:4], off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc @@ -3958,10 +3910,8 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1 -; GFX11-PAL-NEXT: v_mov_b32_e32 v2, 2 -; GFX11-PAL-NEXT: v_mov_b32_e32 v3, 3 -; GFX11-PAL-NEXT: v_mov_b32_e32 v4, 4 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_mov_b32 v2, 2 +; GFX11-PAL-NEXT: v_dual_mov_b32 v3, 3 :: v_dual_mov_b32 v4, 4 ; GFX11-PAL-NEXT: scratch_store_b128 v0, v[1:4], off dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b128 v[0:3], v0, off glc dlc @@ -4109,8 +4059,7 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; 
GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 -; GFX11-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0 ; GFX11-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc @@ -4167,8 +4116,7 @@ ; GFX11-PAL: ; %bb.0: ; %bb ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-PAL-NEXT: v_add_nc_u32_e32 v0, 0xfffff000, v0 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 1 +; GFX11-PAL-NEXT: v_dual_mov_b32 v1, 1 :: v_dual_add_nc_u32 v0, 0xfffff000, v0 ; GFX11-PAL-NEXT: scratch_store_b8 v0, v1, off offset:-129 dlc ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_u8 v0, v0, off offset:-129 glc dlc @@ -4243,8 +4191,7 @@ ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, 16 -; GFX11-NEXT: v_mov_b32_e32 v1, 0x810 +; GFX11-NEXT: v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v0 ; GFX11-NEXT: ;;#ASMEND @@ -4344,8 +4291,7 @@ ; GFX11-PAL-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-PAL-NEXT: scratch_load_b128 v[0:3], off, off offset:3024 glc dlc ; GFX11-PAL-NEXT: s_waitcnt vmcnt(0) -; GFX11-PAL-NEXT: v_mov_b32_e32 v0, 16 -; GFX11-PAL-NEXT: v_mov_b32_e32 v1, 0x810 +; GFX11-PAL-NEXT: v_dual_mov_b32 v0, 16 :: v_dual_mov_b32 v1, 0x810 ; GFX11-PAL-NEXT: ;;#ASMSTART ; GFX11-PAL-NEXT: ; use v0 ; GFX11-PAL-NEXT: ;;#ASMEND diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -377,6 +377,7 @@ ; GCN-O1-NEXT: Branch Probability Basic Block Placement ; GCN-O1-NEXT: Insert fentry calls ; GCN-O1-NEXT: Insert XRay ops +; GCN-O1-NEXT: 
GCN Create VOPD Instructions ; GCN-O1-NEXT: SI Memory Legalizer ; GCN-O1-NEXT: MachinePostDominator Tree Construction ; GCN-O1-NEXT: SI insert wait instructions @@ -665,6 +666,7 @@ ; GCN-O1-OPTS-NEXT: Branch Probability Basic Block Placement ; GCN-O1-OPTS-NEXT: Insert fentry calls ; GCN-O1-OPTS-NEXT: Insert XRay ops +; GCN-O1-OPTS-NEXT: GCN Create VOPD Instructions ; GCN-O1-OPTS-NEXT: SI Memory Legalizer ; GCN-O1-OPTS-NEXT: MachinePostDominator Tree Construction ; GCN-O1-OPTS-NEXT: SI insert wait instructions @@ -955,6 +957,7 @@ ; GCN-O2-NEXT: Branch Probability Basic Block Placement ; GCN-O2-NEXT: Insert fentry calls ; GCN-O2-NEXT: Insert XRay ops +; GCN-O2-NEXT: GCN Create VOPD Instructions ; GCN-O2-NEXT: SI Memory Legalizer ; GCN-O2-NEXT: MachinePostDominator Tree Construction ; GCN-O2-NEXT: SI insert wait instructions @@ -1258,6 +1261,7 @@ ; GCN-O3-NEXT: Branch Probability Basic Block Placement ; GCN-O3-NEXT: Insert fentry calls ; GCN-O3-NEXT: Insert XRay ops +; GCN-O3-NEXT: GCN Create VOPD Instructions ; GCN-O3-NEXT: SI Memory Legalizer ; GCN-O3-NEXT: MachinePostDominator Tree Construction ; GCN-O3-NEXT: SI insert wait instructions diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.fdot2.f32.bf16.ll @@ -8,13 +8,12 @@ ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_clamp: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 clamp ; GFX11-NEXT: 
global_store_b32 v1, v0, s[0:1] @@ -38,13 +37,12 @@ ; GFX11-LABEL: test_llvm_amdgcn_fdot2_f32_bf16_no_clamp: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_load_b256 s[0:7], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_load_b32 s6, s[6:7], 0x0 ; GFX11-NEXT: s_load_b32 s2, s[2:3], 0x0 ; GFX11-NEXT: s_load_b32 s3, s[4:5], 0x0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s6 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_dot2_f32_bf16 v0, s2, s3, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.dim.ll @@ -124,17 +124,14 @@ ; ; GFX11-LABEL: load_1d_tfe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v7, v6 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0 ; GFX11-NEXT: v_mov_b32_e32 v9, v6 ; GFX11-NEXT: v_mov_b32_e32 v10, v6 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v0, v6 -; GFX11-NEXT: v_mov_b32_e32 v1, v7 -; GFX11-NEXT: v_mov_b32_e32 v2, v8 -; GFX11-NEXT: v_mov_b32_e32 v3, v9 -; GFX11-NEXT: v_mov_b32_e32 v4, v10 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10 ; GFX11-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v6, v4, s[8:9] @@ -231,17 +228,14 @@ ; ; GFX11-LABEL: load_1d_lwe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v7, v6 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; 
GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0 ; GFX11-NEXT: v_mov_b32_e32 v9, v6 ; GFX11-NEXT: v_mov_b32_e32 v10, v6 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v0, v6 -; GFX11-NEXT: v_mov_b32_e32 v1, v7 -; GFX11-NEXT: v_mov_b32_e32 v2, v8 -; GFX11-NEXT: v_mov_b32_e32 v3, v9 -; GFX11-NEXT: v_mov_b32_e32 v4, v10 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10 ; GFX11-NEXT: image_load v[0:4], v5, s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v6, v4, s[8:9] @@ -377,18 +371,15 @@ ; ; GFX11-LABEL: load_2d_tfe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v7, 0 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v9, v7 -; GFX11-NEXT: v_mov_b32_e32 v10, v7 ; GFX11-NEXT: v_mov_b32_e32 v11, v7 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v0, v7 -; GFX11-NEXT: v_mov_b32_e32 v1, v8 -; GFX11-NEXT: v_mov_b32_e32 v2, v9 -; GFX11-NEXT: v_mov_b32_e32 v3, v10 -; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11 ; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v7, v4, s[8:9] @@ -528,19 +519,15 @@ ; ; GFX11-LABEL: load_3d_tfe_lwe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_mov_b32_e32 v7, v2 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 -; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 +; GFX11-NEXT: 
v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v0, v8 -; GFX11-NEXT: v_mov_b32_e32 v1, v9 -; GFX11-NEXT: v_mov_b32_e32 v2, v10 -; GFX11-NEXT: v_mov_b32_e32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_3D unorm tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] @@ -680,19 +667,15 @@ ; ; GFX11-LABEL: load_cube_lwe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_mov_b32_e32 v7, v2 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 -; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v0, v8 -; GFX11-NEXT: v_mov_b32_e32 v1, v9 -; GFX11-NEXT: v_mov_b32_e32 v2, v10 -; GFX11-NEXT: v_mov_b32_e32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_CUBE unorm lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] @@ -828,18 +811,15 @@ ; ; GFX11-LABEL: load_1darray_tfe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v7, 0 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: 
v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v9, v7 -; GFX11-NEXT: v_mov_b32_e32 v10, v7 ; GFX11-NEXT: v_mov_b32_e32 v11, v7 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v0, v7 -; GFX11-NEXT: v_mov_b32_e32 v1, v8 -; GFX11-NEXT: v_mov_b32_e32 v2, v9 -; GFX11-NEXT: v_mov_b32_e32 v3, v10 -; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11 ; GFX11-NEXT: image_load v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D_ARRAY unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v7, v4, s[8:9] @@ -979,19 +959,15 @@ ; ; GFX11-LABEL: load_2darray_lwe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_mov_b32_e32 v7, v2 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 -; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v0, v8 -; GFX11-NEXT: v_mov_b32_e32 v1, v9 -; GFX11-NEXT: v_mov_b32_e32 v2, v10 -; GFX11-NEXT: v_mov_b32_e32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_ARRAY unorm lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] @@ -1131,19 +1107,15 @@ ; ; GFX11-LABEL: load_2dmsaa_both: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_mov_b32_e32 v7, v2 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v9, 
v8 -; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v0, v8 -; GFX11-NEXT: v_mov_b32_e32 v1, v9 -; GFX11-NEXT: v_mov_b32_e32 v2, v10 -; GFX11-NEXT: v_mov_b32_e32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA unorm tfe lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] @@ -1287,20 +1259,16 @@ ; ; GFX11-LABEL: load_2darraymsaa_tfe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v9, 0 -; GFX11-NEXT: v_mov_b32_e32 v8, v3 -; GFX11-NEXT: v_mov_b32_e32 v7, v2 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_dual_mov_b32 v9, 0 :: v_dual_mov_b32 v8, v3 +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v10, v9 ; GFX11-NEXT: v_mov_b32_e32 v11, v9 -; GFX11-NEXT: v_mov_b32_e32 v12, v9 ; GFX11-NEXT: v_mov_b32_e32 v13, v9 +; GFX11-NEXT: v_mov_b32_e32 v12, v9 +; GFX11-NEXT: v_mov_b32_e32 v10, v9 ; GFX11-NEXT: v_mov_b32_e32 v0, v9 -; GFX11-NEXT: v_mov_b32_e32 v1, v10 -; GFX11-NEXT: v_mov_b32_e32 v2, v11 -; GFX11-NEXT: v_mov_b32_e32 v3, v12 -; GFX11-NEXT: v_mov_b32_e32 v4, v13 +; GFX11-NEXT: v_dual_mov_b32 v2, v11 :: v_dual_mov_b32 v3, v12 +; GFX11-NEXT: v_dual_mov_b32 v1, v10 :: v_dual_mov_b32 v4, v13 ; GFX11-NEXT: image_load v[0:4], v[5:8], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D_MSAA_ARRAY unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v9, v4, s[8:9] @@ -1436,18 +1404,15 @@ ; ; GFX11-LABEL: load_mip_1d_lwe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: 
v_mov_b32_e32 v7, 0 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 +; GFX11-NEXT: v_dual_mov_b32 v7, 0 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v9, v7 -; GFX11-NEXT: v_mov_b32_e32 v10, v7 ; GFX11-NEXT: v_mov_b32_e32 v11, v7 +; GFX11-NEXT: v_mov_b32_e32 v10, v7 +; GFX11-NEXT: v_mov_b32_e32 v8, v7 ; GFX11-NEXT: v_mov_b32_e32 v0, v7 -; GFX11-NEXT: v_mov_b32_e32 v1, v8 -; GFX11-NEXT: v_mov_b32_e32 v2, v9 -; GFX11-NEXT: v_mov_b32_e32 v3, v10 -; GFX11-NEXT: v_mov_b32_e32 v4, v11 +; GFX11-NEXT: v_dual_mov_b32 v2, v9 :: v_dual_mov_b32 v3, v10 +; GFX11-NEXT: v_dual_mov_b32 v1, v8 :: v_dual_mov_b32 v4, v11 ; GFX11-NEXT: image_load_mip v[0:4], v[5:6], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_1D unorm lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v7, v4, s[8:9] @@ -1587,19 +1552,15 @@ ; ; GFX11-LABEL: load_mip_2d_tfe: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v8, 0 -; GFX11-NEXT: v_mov_b32_e32 v7, v2 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v9, v8 -; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v8, 0 +; GFX11-NEXT: v_dual_mov_b32 v7, v2 :: v_dual_mov_b32 v6, v1 ; GFX11-NEXT: v_mov_b32_e32 v11, v8 ; GFX11-NEXT: v_mov_b32_e32 v12, v8 +; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_mov_b32_e32 v9, v8 ; GFX11-NEXT: v_mov_b32_e32 v0, v8 -; GFX11-NEXT: v_mov_b32_e32 v1, v9 -; GFX11-NEXT: v_mov_b32_e32 v2, v10 -; GFX11-NEXT: v_mov_b32_e32 v3, v11 -; GFX11-NEXT: v_mov_b32_e32 v4, v12 +; GFX11-NEXT: v_dual_mov_b32 v2, v10 :: v_dual_mov_b32 v1, v9 +; GFX11-NEXT: v_dual_mov_b32 v3, v11 :: v_dual_mov_b32 v4, v12 ; GFX11-NEXT: image_load_mip v[0:4], v[5:7], s[0:7] dmask:0xf dim:SQ_RSRC_IMG_2D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v8, v4, s[8:9] @@ -1996,15 +1957,12 @@ ; ; GFX11-LABEL: load_1d_tfe_V4_dmask3: ; GFX11: ; %bb.0: ; %main_body -; 
GFX11-NEXT: v_mov_b32_e32 v5, 0 -; GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, 0 ; GFX11-NEXT: v_mov_b32_e32 v7, v5 ; GFX11-NEXT: v_mov_b32_e32 v8, v5 -; GFX11-NEXT: v_mov_b32_e32 v0, v5 -; GFX11-NEXT: v_mov_b32_e32 v1, v6 -; GFX11-NEXT: v_mov_b32_e32 v2, v7 -; GFX11-NEXT: v_mov_b32_e32 v3, v8 +; GFX11-NEXT: v_mov_b32_e32 v6, v5 +; GFX11-NEXT: v_dual_mov_b32 v0, v5 :: v_dual_mov_b32 v3, v8 +; GFX11-NEXT: v_dual_mov_b32 v1, v6 :: v_dual_mov_b32 v2, v7 ; GFX11-NEXT: image_load v[0:3], v4, s[0:7] dmask:0x7 dim:SQ_RSRC_IMG_1D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v5, v3, s[8:9] @@ -2089,13 +2047,11 @@ ; ; GFX11-LABEL: load_1d_tfe_V4_dmask2: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v4, 0 -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v4 +; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v4, 0 ; GFX11-NEXT: v_mov_b32_e32 v6, v4 +; GFX11-NEXT: v_mov_b32_e32 v5, v4 ; GFX11-NEXT: v_mov_b32_e32 v0, v4 -; GFX11-NEXT: v_mov_b32_e32 v1, v5 -; GFX11-NEXT: v_mov_b32_e32 v2, v6 +; GFX11-NEXT: v_dual_mov_b32 v2, v6 :: v_dual_mov_b32 v1, v5 ; GFX11-NEXT: image_load v[0:2], v3, s[0:7] dmask:0x6 dim:SQ_RSRC_IMG_1D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v4, v2, s[8:9] @@ -2174,11 +2130,9 @@ ; ; GFX11-LABEL: load_1d_tfe_V4_dmask1: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 ; GFX11-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v3, v1, s[8:9] @@ -2257,11 +2211,9 @@ ; ; GFX11-LABEL: load_1d_tfe_V2_dmask1: ; GFX11: ; 
%bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v2, v0 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, v3 -; GFX11-NEXT: v_mov_b32_e32 v0, v3 -; GFX11-NEXT: v_mov_b32_e32 v1, v4 +; GFX11-NEXT: v_dual_mov_b32 v0, v3 :: v_dual_mov_b32 v1, v4 ; GFX11-NEXT: image_load v[0:1], v2, s[0:7] dmask:0x8 dim:SQ_RSRC_IMG_1D unorm tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b32 v3, v1, s[8:9] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.a16.dim.ll @@ -1,7 +1,7 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps <4 x float> @sample_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %s) { ; GFX9-LABEL: sample_1d: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.d16.dim.ll @@ -117,8 +117,7 @@ ; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo ; GFX11-NEXT: v_mov_b32_e32 v4, 0 ; GFX11-NEXT: v_mov_b32_e32 v5, v4 -; GFX11-NEXT: v_mov_b32_e32 v2, v4 -; GFX11-NEXT: v_mov_b32_e32 v3, v5 +; GFX11-NEXT: v_dual_mov_b32 v2, v4 :: v_dual_mov_b32 v3, v5 ; 
GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX11-NEXT: image_sample v[2:3], v[0:1], s[0:7], s[8:11] dmask:0x1 dim:SQ_RSRC_IMG_2D tfe d16 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -199,15 +198,24 @@ ; GFX9-NEXT: v_mov_b32_e32 v1, v5 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: image_sample_c_d_1d_v2f16_tfe: -; GFX10PLUS: ; %bb.0: ; %main_body -; GFX10PLUS-NEXT: v_mov_b32_e32 v5, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v4, v1 -; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 -; GFX10PLUS-NEXT: image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16 -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: image_sample_c_d_1d_v2f16_tfe: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v5, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: image_sample_c_d_1d_v2f16_tfe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 +; GFX11-NEXT: v_mov_b32_e32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: image_sample_c_d v[0:1], [v5, v4, v2, v3], s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe d16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog main_body: %tex = call {<2 x half>,i32} @llvm.amdgcn.image.sample.c.d.1d.v2f16i32.f32.f32(i32 3, float %zcompare, float %dsdh, float %dsdv, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) %tex.vec = extractvalue {<2 x half>, i32} %tex, 0 @@ -313,20 +321,33 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, v5 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: image_sample_b_2d_v3f16_tfe: -; GFX10PLUS: ; %bb.0: ; %main_body -; 
GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo -; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v5, v2 -; GFX10PLUS-NEXT: v_mov_b32_e32 v4, v1 -; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0 -; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10PLUS-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16 -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: image_sample_b_2d_v3f16_tfe: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s12, exec_lo +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: image_sample_b_2d_v3f16_tfe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_mov_b32 s12, exec_lo +; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX11-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0x7 dim:SQ_RSRC_IMG_2D tfe d16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog main_body: %tex = call {<3 x half>,i32} @llvm.amdgcn.image.sample.b.2d.v3f16i32.f32.f32(i32 7, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) %tex.vec = extractvalue {<3 x half>, i32} %tex, 0 @@ -438,20 +459,33 
@@ ; GFX9-NEXT: v_mov_b32_e32 v2, v5 ; GFX9-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: image_sample_b_2d_v4f16_tfe: -; GFX10PLUS: ; %bb.0: ; %main_body -; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo -; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v5, v2 -; GFX10PLUS-NEXT: v_mov_b32_e32 v4, v1 -; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0 -; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10PLUS-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16 -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: image_sample_b_2d_v4f16_tfe: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s12, exec_lo +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v5, v2 +; GFX10-NEXT: v_mov_b32_e32 v4, v1 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16 +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: image_sample_b_2d_v4f16_tfe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_mov_b32 s12, exec_lo +; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: v_dual_mov_b32 v5, v2 :: v_dual_mov_b32 v4, v1 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX11-NEXT: image_sample_b v[0:2], v[3:5], s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_2D tfe d16 +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog main_body: %tex = call {<4 x half>,i32} 
@llvm.amdgcn.image.sample.b.2d.v4f16i32.f32.f32(i32 15, float %bias, float %s, float %t, <8 x i32> %rsrc, <4 x i32> %samp, i1 false, i32 1, i32 0) %tex.vec = extractvalue {<4 x half>, i32} %tex, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.dim.ll @@ -104,17 +104,14 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: s_mov_b32 s14, exec_lo ; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v7, v6 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0 ; GFX11-NEXT: v_mov_b32_e32 v9, v6 ; GFX11-NEXT: v_mov_b32_e32 v10, v6 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v0, v6 -; GFX11-NEXT: v_mov_b32_e32 v1, v7 -; GFX11-NEXT: v_mov_b32_e32 v2, v8 -; GFX11-NEXT: v_mov_b32_e32 v3, v9 -; GFX11-NEXT: v_mov_b32_e32 v4, v10 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: v_dual_mov_b32 v4, v10 ; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX11-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -344,18 +341,30 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: sample_1d_tfe_adjust_writemask_12: -; GFX10PLUS: ; %bb.0: ; %main_body -; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo -; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0 -; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10PLUS-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe -; 
GFX10PLUS-NEXT: s_waitcnt vmcnt(0) -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: sample_1d_tfe_adjust_writemask_12: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s12, exec_lo +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: sample_1d_tfe_adjust_writemask_12: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_mov_b32 s12, exec_lo +; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX11-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0x3 dim:SQ_RSRC_IMG_1D tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) %res.vec = extractvalue {<4 x float>,i32} %v, 0 @@ -396,18 +405,30 @@ ; GFX6789-NEXT: s_waitcnt vmcnt(0) ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: sample_1d_tfe_adjust_writemask_24: -; GFX10PLUS: ; %bb.0: ; %main_body -; GFX10PLUS-NEXT: s_mov_b32 s12, exec_lo -; GFX10PLUS-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX10PLUS-NEXT: v_mov_b32_e32 v3, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v0, 0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v0 -; GFX10PLUS-NEXT: s_and_b32 exec_lo, exec_lo, s12 -; GFX10PLUS-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D tfe -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) -; GFX10PLUS-NEXT: ; return to shader part epilog 
+; GFX10-LABEL: sample_1d_tfe_adjust_writemask_24: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: s_mov_b32 s12, exec_lo +; GFX10-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX10-NEXT: v_mov_b32_e32 v3, v0 +; GFX10-NEXT: v_mov_b32_e32 v0, 0 +; GFX10-NEXT: v_mov_b32_e32 v1, v0 +; GFX10-NEXT: v_mov_b32_e32 v2, v0 +; GFX10-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX10-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: sample_1d_tfe_adjust_writemask_24: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: s_mov_b32 s12, exec_lo +; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo +; GFX11-NEXT: v_dual_mov_b32 v3, v0 :: v_dual_mov_b32 v0, 0 +; GFX11-NEXT: v_mov_b32_e32 v1, v0 +; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s12 +; GFX11-NEXT: image_sample v[0:2], v3, s[0:7], s[8:11] dmask:0xa dim:SQ_RSRC_IMG_1D tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<4 x float>,i32} @llvm.amdgcn.image.sample.1d.v4f32i32.f32(i32 15, float %s, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) %res.vec = extractvalue {<4 x float>,i32} %v, 0 @@ -546,17 +567,14 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: s_mov_b32 s14, exec_lo ; GFX11-NEXT: s_wqm_b32 exec_lo, exec_lo -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 -; GFX11-NEXT: v_mov_b32_e32 v7, v6 -; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_dual_mov_b32 v5, v0 :: v_dual_mov_b32 v6, 0 ; GFX11-NEXT: v_mov_b32_e32 v9, v6 ; GFX11-NEXT: v_mov_b32_e32 v10, v6 +; GFX11-NEXT: v_mov_b32_e32 v8, v6 +; GFX11-NEXT: v_mov_b32_e32 v7, v6 ; GFX11-NEXT: v_mov_b32_e32 v0, v6 -; GFX11-NEXT: v_mov_b32_e32 v1, v7 -; GFX11-NEXT: v_mov_b32_e32 v2, v8 -; GFX11-NEXT: v_mov_b32_e32 v3, v9 -; GFX11-NEXT: v_mov_b32_e32 v4, v10 +; GFX11-NEXT: v_dual_mov_b32 v2, v8 :: v_dual_mov_b32 v1, v7 +; GFX11-NEXT: v_dual_mov_b32 v3, v9 :: 
v_dual_mov_b32 v4, v10 ; GFX11-NEXT: s_and_b32 exec_lo, exec_lo, s14 ; GFX11-NEXT: image_sample v[0:4], v5, s[0:7], s[8:11] dmask:0xf dim:SQ_RSRC_IMG_1D lwe ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -1614,8 +1632,7 @@ ; GFX11: ; %bb.0: ; %main_body ; GFX11-NEXT: v_mov_b32_e32 v11, 0 ; GFX11-NEXT: v_mov_b32_e32 v12, v11 -; GFX11-NEXT: v_mov_b32_e32 v9, v11 -; GFX11-NEXT: v_mov_b32_e32 v10, v12 +; GFX11-NEXT: v_dual_mov_b32 v9, v11 :: v_dual_mov_b32 v10, v12 ; GFX11-NEXT: image_sample_c_d_o v[9:10], v[0:15], s[0:7], s[8:11] dmask:0x4 dim:SQ_RSRC_IMG_2D_ARRAY tfe ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v0, v9 @@ -1678,17 +1695,28 @@ ; GFX6789-NEXT: v_mov_b32_e32 v2, v11 ; GFX6789-NEXT: ; return to shader part epilog ; -; GFX10PLUS-LABEL: sample_c_d_o_2darray_V2_tfe: -; GFX10PLUS: ; %bb.0: ; %main_body -; GFX10PLUS-NEXT: v_mov_b32_e32 v9, 0 -; GFX10PLUS-NEXT: v_mov_b32_e32 v10, v9 -; GFX10PLUS-NEXT: v_mov_b32_e32 v11, v9 -; GFX10PLUS-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe -; GFX10PLUS-NEXT: s_waitcnt vmcnt(0) -; GFX10PLUS-NEXT: v_mov_b32_e32 v0, v9 -; GFX10PLUS-NEXT: v_mov_b32_e32 v1, v10 -; GFX10PLUS-NEXT: v_mov_b32_e32 v2, v11 -; GFX10PLUS-NEXT: ; return to shader part epilog +; GFX10-LABEL: sample_c_d_o_2darray_V2_tfe: +; GFX10: ; %bb.0: ; %main_body +; GFX10-NEXT: v_mov_b32_e32 v9, 0 +; GFX10-NEXT: v_mov_b32_e32 v10, v9 +; GFX10-NEXT: v_mov_b32_e32 v11, v9 +; GFX10-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe +; GFX10-NEXT: s_waitcnt vmcnt(0) +; GFX10-NEXT: v_mov_b32_e32 v0, v9 +; GFX10-NEXT: v_mov_b32_e32 v1, v10 +; GFX10-NEXT: v_mov_b32_e32 v2, v11 +; GFX10-NEXT: ; return to shader part epilog +; +; GFX11-LABEL: sample_c_d_o_2darray_V2_tfe: +; GFX11: ; %bb.0: ; %main_body +; GFX11-NEXT: v_mov_b32_e32 v9, 0 +; GFX11-NEXT: v_mov_b32_e32 v10, v9 +; GFX11-NEXT: v_mov_b32_e32 v11, v9 +; GFX11-NEXT: image_sample_c_d_o v[9:11], v[0:15], s[0:7], 
s[8:11] dmask:0x6 dim:SQ_RSRC_IMG_2D_ARRAY tfe +; GFX11-NEXT: s_waitcnt vmcnt(0) +; GFX11-NEXT: v_mov_b32_e32 v2, v11 +; GFX11-NEXT: v_dual_mov_b32 v0, v9 :: v_dual_mov_b32 v1, v10 +; GFX11-NEXT: ; return to shader part epilog main_body: %v = call {<2 x float>, i32} @llvm.amdgcn.image.sample.c.d.o.2darray.v2f32i32.f32.f32(i32 6, i32 %offset, float %zcompare, float %dsdh, float %dtdh, float %dsdv, float %dtdv, float %s, float %t, float %slice, <8 x i32> %rsrc, <4 x i32> %samp, i1 0, i32 1, i32 0) %v.vec = extractvalue {<2 x float>, i32} %v, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.encode.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx1010 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -show-mc-encoding < %s | FileCheck -check-prefixes=GFX11 %s define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_d_1d: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.image.sample.g16.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s -; RUN: llc -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s +; RUN: llc 
-march=amdgcn -mcpu=gfx1100 -amdgpu-enable-delay-alu=0 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX10 %s define amdgpu_ps <4 x float> @sample_d_1d(<8 x i32> inreg %rsrc, <4 x i32> inreg %samp, half %dsdh, half %dsdv, float %s) { ; GFX10-LABEL: sample_d_1d: diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.interp.inreg.ll @@ -42,9 +42,8 @@ ; GCN-NEXT: lds_param_load v2, attr2.x wait_vdst:15 ; GCN-NEXT: lds_param_load v3, attr3.x wait_vdst:15 ; GCN-NEXT: s_mov_b32 exec_lo, s3 -; GCN-NEXT: v_mov_b32_e32 v4, s0 -; GCN-NEXT: v_mov_b32_e32 v5, s1 -; GCN-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_3) | instid1(VALU_DEP_4) +; GCN-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GCN-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_4) ; GCN-NEXT: v_interp_p10_f32 v6, v0, v4, v0 wait_exp:3 ; GCN-NEXT: v_interp_p10_f32 v7, v1, v4, v1 wait_exp:2 ; GCN-NEXT: v_interp_p10_f32 v8, v2, v4, v2 wait_exp:1 diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.intersect_ray.ll @@ -62,19 +62,16 @@ ; ; GFX11-LABEL: image_bvh_intersect_ray_a16: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: s_lshr_b32 s2, s7, 16 ; GFX11-NEXT: s_lshr_b32 s3, s5, 16 -; GFX11-NEXT: v_mov_b32_e32 v2, s4 +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v7, s1 ; GFX11-NEXT: s_pack_ll_b32_b16 s2, s3, s2 ; GFX11-NEXT: s_pack_ll_b32_b16 s3, s5, s7 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: 
v_dual_mov_b32 v2, s4 :: v_dual_mov_b32 v3, s3 ; GFX11-NEXT: s_pack_ll_b32_b16 s4, s6, s8 -; GFX11-NEXT: v_mov_b32_e32 v3, s3 -; GFX11-NEXT: v_mov_b32_e32 v4, s2 -; GFX11-NEXT: v_mov_b32_e32 v5, s4 -; GFX11-NEXT: v_mov_b32_e32 v6, s0 -; GFX11-NEXT: v_mov_b32_e32 v7, s1 +; GFX11-NEXT: v_dual_mov_b32 v4, s2 :: v_dual_mov_b32 v5, s4 ; GFX11-NEXT: s_mov_b32 s15, s12 ; GFX11-NEXT: s_mov_b32 s14, s11 ; GFX11-NEXT: s_mov_b32 s13, s10 @@ -137,20 +134,17 @@ ; ; GFX11-LABEL: image_bvh64_intersect_ray_a16: ; GFX11: ; %bb.0: ; %main_body -; GFX11-NEXT: v_mov_b32_e32 v0, s3 -; GFX11-NEXT: v_mov_b32_e32 v6, s0 -; GFX11-NEXT: s_lshr_b32 s0, s8, 16 +; GFX11-NEXT: v_dual_mov_b32 v0, s3 :: v_dual_mov_b32 v1, s4 +; GFX11-NEXT: v_dual_mov_b32 v2, s5 :: v_dual_mov_b32 v7, s1 ; GFX11-NEXT: s_lshr_b32 s3, s6, 16 -; GFX11-NEXT: v_mov_b32_e32 v7, s1 -; GFX11-NEXT: s_pack_ll_b32_b16 s0, s3, s0 ; GFX11-NEXT: s_pack_ll_b32_b16 s1, s6, s8 -; GFX11-NEXT: s_pack_ll_b32_b16 s3, s7, s9 -; GFX11-NEXT: v_mov_b32_e32 v1, s4 -; GFX11-NEXT: v_mov_b32_e32 v2, s5 -; GFX11-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-NEXT: v_mov_b32_e32 v4, s0 -; GFX11-NEXT: v_mov_b32_e32 v5, s3 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_4) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: v_dual_mov_b32 v6, s0 :: v_dual_mov_b32 v3, s1 +; GFX11-NEXT: s_lshr_b32 s0, s8, 16 ; GFX11-NEXT: v_mov_b32_e32 v8, s2 +; GFX11-NEXT: s_pack_ll_b32_b16 s0, s3, s0 +; GFX11-NEXT: s_pack_ll_b32_b16 s3, s7, s9 +; GFX11-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s3 ; GFX11-NEXT: s_mov_b32 s15, s13 ; GFX11-NEXT: s_mov_b32 s14, s12 ; GFX11-NEXT: s_mov_b32 s13, s11 @@ -226,11 +220,9 @@ ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v4, 4.0 -; GFX11-NEXT: v_mov_b32_e32 v5, 0x40a00000 -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: v_mov_b32_e32 v7, 1.0 +; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 
0 ; GFX11-NEXT: v_mov_b32_e32 v8, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v4, 4.0 :: v_dual_mov_b32 v7, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) @@ -323,20 +315,19 @@ ; GFX11-NEXT: s_load_b128 s[4:7], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v2, 2, v0 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX11-NEXT: v_mov_b32_e32 v5, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v5, 2.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v1, null, s5, 0, s4 ; GFX11-NEXT: v_add_co_u32 v2, s4, s6, v2 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_add_co_ci_u32_e64 v3, null, s7, 0, s4 ; GFX11-NEXT: flat_load_b32 v6, v[0:1] ; GFX11-NEXT: flat_load_b32 v7, v[2:3] -; GFX11-NEXT: v_mov_b32_e32 v0, 0x46004200 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x47004400 +; GFX11-NEXT: v_dual_mov_b32 v0, 0x46004200 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 -; GFX11-NEXT: v_mov_b32_e32 v3, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-NEXT: image_bvh_intersect_ray v[0:3], [v6, v7, v[3:5], v[0:2]], s[0:3] a16 ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -422,14 +413,10 @@ ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41000000 -; GFX11-NEXT: v_mov_b32_e32 v3, 0x40400000 -; GFX11-NEXT: v_mov_b32_e32 v4, 4.0 -; GFX11-NEXT: v_mov_b32_e32 v5, 0x40a00000 -; GFX11-NEXT: v_mov_b32_e32 v6, 0 -; GFX11-NEXT: v_mov_b32_e32 v7, 1.0 -; GFX11-NEXT: v_mov_b32_e32 v8, 2.0 -; GFX11-NEXT: v_mov_b32_e32 v9, 0xb36211c7 -; GFX11-NEXT: v_mov_b32_e32 v10, 0x102 +; GFX11-NEXT: v_dual_mov_b32 v3, 0x40400000 
:: v_dual_mov_b32 v4, 4.0 +; GFX11-NEXT: v_dual_mov_b32 v5, 0x40a00000 :: v_dual_mov_b32 v6, 0 +; GFX11-NEXT: v_dual_mov_b32 v8, 2.0 :: v_dual_mov_b32 v9, 0xb36211c7 +; GFX11-NEXT: v_dual_mov_b32 v10, 0x102 :: v_dual_mov_b32 v7, 1.0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) @@ -513,12 +500,9 @@ ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x24 ; GFX11-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-NEXT: s_load_b128 s[0:3], s[0:1], 0x34 -; GFX11-NEXT: v_mov_b32_e32 v2, 0x48004500 -; GFX11-NEXT: v_mov_b32_e32 v3, 0 -; GFX11-NEXT: v_mov_b32_e32 v4, 1.0 -; GFX11-NEXT: v_mov_b32_e32 v5, 2.0 -; GFX11-NEXT: v_mov_b32_e32 v6, 0xb36211c6 -; GFX11-NEXT: v_mov_b32_e32 v7, 0x102 +; GFX11-NEXT: v_dual_mov_b32 v2, 0x48004500 :: v_dual_mov_b32 v5, 2.0 +; GFX11-NEXT: v_dual_mov_b32 v4, 1.0 :: v_dual_mov_b32 v7, 0x102 +; GFX11-NEXT: v_dual_mov_b32 v6, 0xb36211c6 :: v_dual_mov_b32 v3, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: v_add_co_u32 v0, s4, s4, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.mov.dpp8.ll @@ -1,7 +1,7 @@ ; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s ; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1010 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s -; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s -; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s +; RUN: llc -global-isel=0 -march=amdgcn -mcpu=gfx1100 -amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s +; RUN: llc -global-isel=1 -march=amdgcn -mcpu=gfx1100 
-amdgpu-enable-vopd=0 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX10PLUS %s ; GFX10PLUS-LABEL: {{^}}dpp8_test: ; GFX10PLUS: v_mov_b32_e32 [[SRC:v[0-9]+]], s{{[0-9]+}} diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.permlane64.ll @@ -11,9 +11,8 @@ ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: s_load_b32 s2, s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane64_b32 v0, v0 ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] @@ -28,9 +27,8 @@ ; GFX11-LABEL: test_i: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v0, 0x63 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-NEXT: v_dual_mov_b32 v0, 0x63 :: v_dual_mov_b32 v1, 0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_permlane64_b32 v0, v0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_store_b32 v1, v0, s[0:1] diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.sendmsg.rtn.ll @@ -7,9 +7,8 @@ ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-SDAG-NEXT: s_endpgm @@ -18,9 +17,8 @@ ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DOORBELL) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -34,9 +32,8 @@ ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -45,9 +42,8 @@ ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_GET_DDID) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -63,8 +59,7 @@ ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TMA) ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -80,8 +75,7 @@ ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_REALTIME) ; 
GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -95,9 +89,8 @@ ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -106,9 +99,8 @@ ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(MSG_RTN_SAVE_WAVE) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -124,8 +116,7 @@ ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], sendmsg(MSG_RTN_GET_TBA) ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -139,9 +130,8 @@ ; GFX11-SDAG: ; %bb.0: ; GFX11-SDAG-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-SDAG-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-SDAG-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-SDAG-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX11-SDAG-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-SDAG-NEXT: global_store_b32 v0, v1, s[0:1] ; GFX11-SDAG-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-SDAG-NEXT: s_endpgm @@ -150,9 +140,8 @@ ; GFX11-GISEL: ; %bb.0: ; GFX11-GISEL-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 ; GFX11-GISEL-NEXT: s_sendmsg_rtn_b32 s2, sendmsg(0, 0, 0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-GISEL-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-GISEL-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-GISEL-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-GISEL-NEXT: global_store_b32 v1, v0, s[0:1] ; GFX11-GISEL-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-GISEL-NEXT: s_endpgm @@ -168,8 +157,7 @@ ; GFX11-NEXT: s_sendmsg_rtn_b64 s[2:3], 99999 ; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll --- a/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.mulo.ll @@ -81,9 +81,8 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0 ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v4, v3, 0 ; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v5, v2, 0 @@ -226,9 +225,8 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GFX11-NEXT: v_mov_b32_e32 v5, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v4, v0 :: v_dual_mov_b32 v5, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v2, 0 ; GFX11-NEXT: v_mad_u64_u32 v[6:7], null, v4, v3, 0 ; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v5, v2, 0 @@ -251,19 +249,18 @@ ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v10, vcc_lo, 0, v7, vcc_lo ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v5 ; GFX11-NEXT: v_add3_u32 v1, v1, v9, v8 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_4) -; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_cndmask_b32_e32 v5, v7, v10, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: v_cndmask_b32_e32 v6, v6, v2, vcc_lo ; GFX11-NEXT: v_ashrrev_i32_e32 v2, 31, v1 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_sub_co_u32 v4, vcc_lo, v6, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_4) ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v7, vcc_lo, 0, v5, vcc_lo ; GFX11-NEXT: v_cmp_gt_i32_e32 vcc_lo, 0, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) | instskip(NEXT) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v4, v6, v4, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v5, v5, v7 :: v_dual_cndmask_b32 v4, v6, v4 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[4:5], v[2:3] ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] 
@@ -558,23 +555,21 @@ ; GFX11-NEXT: s_addc_u32 s6, 0, s6 ; GFX11-NEXT: s_sub_u32 s9, s4, s2 ; GFX11-NEXT: s_subb_u32 s10, s6, 0 -; GFX11-NEXT: v_mov_b32_e32 v1, s9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v1, s9 :: v_dual_mov_b32 v0, s10 ; GFX11-NEXT: s_cmp_lt_i32 s1, 0 -; GFX11-NEXT: v_mov_b32_e32 v0, s10 ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_cmp_lt_i32 s3, 0 ; GFX11-NEXT: v_cndmask_b32_e32 v2, s4, v1, vcc_lo -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cndmask_b32_e32 v0, s6, v0, vcc_lo +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_sub_co_u32 v3, vcc_lo, v2, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-NEXT: v_subrev_co_ci_u32_e32 v1, vcc_lo, 0, v0, vcc_lo ; GFX11-NEXT: s_cselect_b32 vcc_lo, -1, 0 ; GFX11-NEXT: s_add_i32 s1, s8, s7 ; GFX11-NEXT: s_mul_i32 s0, s0, s2 ; GFX11-NEXT: s_add_i32 s1, s1, s5 -; GFX11-NEXT: v_cndmask_b32_e32 v1, v0, v1, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e32 v0, v2, v3, vcc_lo +; GFX11-NEXT: v_dual_cndmask_b32 v1, v0, v1 :: v_dual_cndmask_b32 v0, v2, v3 ; GFX11-NEXT: s_ashr_i32 s4, s1, 31 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_mov_b32 s5, s4 @@ -641,9 +636,8 @@ ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_ashrrev_i64 v[5:6], 2, v[4:5] ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[5:6], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_4) -; GFX11-NEXT: v_mov_b32_e32 v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: @@ -696,15 +690,12 @@ ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; 
GFX11-NEXT: v_and_b32_e32 v7, 0x3fffffff, v1 -; GFX11-NEXT: v_mov_b32_e32 v6, v0 +; GFX11-NEXT: v_dual_mov_b32 v6, v0 :: v_dual_and_b32 v7, 0x3fffffff, v1 ; GFX11-NEXT: v_lshlrev_b64 v[4:5], 2, v[0:1] ; GFX11-NEXT: v_alignbit_b32 v3, v1, v0, 30 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_cmp_ne_u64_e32 vcc_lo, v[6:7], v[0:1] -; GFX11-NEXT: v_mov_b32_e32 v0, v4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) -; GFX11-NEXT: v_mov_b32_e32 v1, v3 +; GFX11-NEXT: v_dual_mov_b32 v0, v4 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: diff --git a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll --- a/llvm/test/CodeGen/AMDGPU/mad_64_32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_64_32.ll @@ -32,8 +32,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -70,8 +69,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -108,8 +106,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 ; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -146,8 +143,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -254,8 +250,7 @@ ; GFX11-NEXT: v_ashrrev_i32_e32 v15, 31, v1 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[9:10], null, v14, v1, v[7:8] -; GFX11-NEXT: v_mov_b32_e32 v7, v10 -; GFX11-NEXT: v_mov_b32_e32 v10, v8 +; GFX11-NEXT: v_dual_mov_b32 v7, v10 :: v_dual_mov_b32 v10, v8 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mad_u64_u32 v[11:12], null, v0, v15, v[9:10] ; GFX11-NEXT: v_mad_i64_i32 v[9:10], null, v1, v14, 0 @@ -310,8 +305,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_i64_i32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -405,8 +399,7 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v4, v1 -; GFX11-NEXT: v_mov_b32_e32 v5, v0 +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_mov_b32 v5, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v5, v4, v[2:3] ; GFX11-NEXT: v_ashrrev_i32_e32 v5, 31, v5 @@ -494,14 +487,12 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: 
s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v3, v2 -; GFX11-NEXT: v_mov_b32_e32 v2, v0 +; GFX11-NEXT: v_dual_mov_b32 v3, v2 :: v_dual_mov_b32 v2, v0 ; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5] -; GFX11-NEXT: v_and_b32_e32 v5, 1, v6 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_mov_b32_e32 v4, v1 +; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v2, v3, v[4:5] +; GFX11-NEXT: v_dual_mov_b32 v4, v1 :: v_dual_and_b32 v5, 1, v6 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v5, v3, v[4:5] ; GFX11-NEXT: s_setpc_b64 s[30:31] %trunc.lhs = and i64 %arg0, 8589934591 @@ -550,10 +541,9 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mov_b32_e32 v6, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_2) +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v6, v2, v[4:5] -; GFX11-NEXT: v_and_b32_e32 v4, 1, v3 -; GFX11-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-NEXT: v_dual_mov_b32 v3, v1 :: v_dual_and_b32 v4, 1, v3 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_mad_u64_u32 v[1:2], null, v6, v4, v[3:4] ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -631,9 +621,8 @@ ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: v_mad_u64_u32 v[2:3], null, v1, v0, v[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_mov_b32_e32 v0, v2 -; GFX11-NEXT: v_mov_b32_e32 v1, v3 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-NEXT: v_dual_mov_b32 v0, v2 :: v_dual_mov_b32 v1, v3 ; GFX11-NEXT: s_setpc_b64 s[30:31] %tmp4 = lshr 
i64 %arg0, 32 %tmp5 = and i64 %arg0, 4294967295 @@ -697,14 +686,13 @@ ; GFX11-NEXT: s_load_b64 s[2:3], s[0:1], 0x2c ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x34 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x24 -; GFX11-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_mul_i32 s6, s2, s3 ; GFX11-NEXT: s_mul_hi_u32 s3, s2, s3 ; GFX11-NEXT: s_add_u32 s2, s6, s4 ; GFX11-NEXT: s_addc_u32 s3, s3, s5 ; GFX11-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v1, s3 ; GFX11-NEXT: global_store_b64 v2, v[0:1], s[0:1] ; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-NEXT: s_endpgm @@ -935,9 +923,8 @@ ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: v_mov_b32_e32 v6, v1 -; GFX11-NEXT: v_mov_b32_e32 v7, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_4) +; GFX11-NEXT: v_dual_mov_b32 v6, v1 :: v_dual_mov_b32 v7, v0 +; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_1) | instid1(VALU_DEP_3) ; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v7, v2, v[4:5] ; GFX11-NEXT: v_mul_lo_u32 v3, v7, v3 ; GFX11-NEXT: v_mul_lo_u32 v2, v6, v2 diff --git a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll --- a/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll +++ b/llvm/test/CodeGen/AMDGPU/mad_u64_u32.ll @@ -15,13 +15,6 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, v1, v[2:3] ; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: mad_i32_vvv: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: v_mov_b32_e32 v4, v0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v4, v3, v[2:3] -; GFX11-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b %add = add i32 %mul, %c %cast = bitcast i32 %add to float @@ -51,13 +44,6 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, v1, 42 ; GFX10-NEXT: ; return 
to shader part epilog -; -; GFX11-LABEL: mad_i32_vvc: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, 42 -; GFX11-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b %add = add i32 %mul, 42 %cast = bitcast i32 %add to float @@ -76,13 +62,6 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, v1, 0x12d687 ; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: mad_i32_vvi: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, 0x12d687 -; GFX11-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b %add = add i32 %mul, 1234567 %cast = bitcast i32 %add to float @@ -143,13 +122,6 @@ ; GFX10: ; %bb.0: ; GFX10-NEXT: v_mad_u64_u32 v[0:1], null, v0, v1, s[0:1] ; GFX10-NEXT: ; return to shader part epilog -; -; GFX11-LABEL: mad_i32_vvs: -; GFX11: ; %bb.0: -; GFX11-NEXT: v_mov_b32_e32 v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v3, v0 -; GFX11-NEXT: v_mad_u64_u32 v[0:1], null, v3, v2, s[0:1] -; GFX11-NEXT: ; return to shader part epilog %mul = mul i32 %a, %b %add = add i32 %mul, %c %cast = bitcast i32 %add to float diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-agent.ll @@ -119,26 +119,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -256,26 +254,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -402,30 +398,28 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -562,34 +556,32 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -691,10 +683,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_unordered_store: @@ -703,10 +695,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -807,10 +799,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_monotonic_store: @@ -819,10 +811,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -935,12 +927,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_release_store: @@ -949,12 +941,12 @@ ; GFX11-CU-NEXT: 
s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -1067,12 +1059,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_seq_cst_store: @@ -1081,12 +1073,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -1187,10 +1179,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 
:: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_monotonic_atomicrmw: @@ -1199,10 +1191,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1322,8 +1314,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1338,8 +1329,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1458,12 +1448,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_release_atomicrmw: @@ -1472,12 +1462,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1609,8 +1599,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1627,8 +1616,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1768,8 +1756,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1786,8 +1773,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1921,14 +1907,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_acquire_ret_atomicrmw: @@ -1937,14 +1923,14 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -2083,8 +2069,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2093,6 +2078,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_acq_rel_ret_atomicrmw: @@ -2101,8 +2087,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2111,6 +2096,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -2249,8 +2235,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2259,6 +2244,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_seq_cst_ret_atomicrmw: @@ -2267,8 +2253,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 
; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2277,6 +2262,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -2390,11 +2376,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_monotonic_monotonic_cmpxchg: @@ -2403,11 +2388,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -2540,10 +2524,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2557,10 +2539,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2691,13 +2671,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_release_monotonic_cmpxchg: @@ -2706,13 +2685,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -2857,10 +2835,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2876,10 +2852,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3031,10 +3005,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3050,10 +3022,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3193,10 +3163,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3210,10 +3178,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3351,10 +3317,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3368,10 +3332,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3521,10 +3483,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ 
-3540,10 +3500,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3695,10 +3653,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3714,10 +3670,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3869,10 +3823,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 
v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3888,10 +3840,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4043,10 +3993,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4062,10 +4010,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 
v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4217,10 +4163,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4236,10 +4180,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4391,10 +4333,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4410,10 +4350,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4565,10 +4503,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4584,10 +4520,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4739,10 +4673,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4758,10 +4690,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4906,13 +4836,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_monotonic_monotonic_ret_cmpxchg: @@ -4921,13 +4850,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5076,15 +5004,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_acquire_monotonic_ret_cmpxchg: @@ -5093,15 +5020,14 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; 
GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5253,15 +5179,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_release_monotonic_ret_cmpxchg: @@ -5270,15 +5195,14 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5439,10 +5363,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5450,6 +5372,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_acq_rel_monotonic_ret_cmpxchg: @@ -5458,10 +5381,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5469,6 +5390,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5629,10 +5551,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5640,6 +5560,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_seq_cst_monotonic_ret_cmpxchg: @@ -5648,10 +5569,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5659,6 +5578,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5807,15 +5727,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; 
GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_monotonic_acquire_ret_cmpxchg: @@ -5824,15 +5743,14 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5981,15 +5899,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_acquire_acquire_ret_cmpxchg: @@ -5998,15 +5915,14 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6167,10 +6083,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6178,6 +6092,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_release_acquire_ret_cmpxchg: @@ -6186,10 +6101,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6197,6 +6110,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6357,10 +6271,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6368,6 +6280,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_acq_rel_acquire_ret_cmpxchg: @@ -6376,10 +6289,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6387,6 +6298,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: 
flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6547,10 +6459,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6558,6 +6468,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_seq_cst_acquire_ret_cmpxchg: @@ -6566,10 +6477,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6577,6 +6486,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6737,10 +6647,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 
s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6748,6 +6656,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_monotonic_seq_cst_ret_cmpxchg: @@ -6756,10 +6665,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6767,6 +6674,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6927,10 +6835,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6938,6 +6844,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_acquire_seq_cst_ret_cmpxchg: @@ -6946,10 +6853,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6957,6 +6862,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -7117,10 +7023,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 
v2, v[0:1], v[2:3] offset:16 glc @@ -7128,6 +7032,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_release_seq_cst_ret_cmpxchg: @@ -7136,10 +7041,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7147,6 +7050,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -7307,10 +7211,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7318,6 +7220,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; 
; GFX11-CU-LABEL: flat_agent_acq_rel_seq_cst_ret_cmpxchg: @@ -7326,10 +7229,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7337,6 +7238,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -7497,10 +7399,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7508,6 +7408,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_seq_cst_seq_cst_ret_cmpxchg: @@ -7516,10 +7417,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 
v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7527,6 +7426,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -7646,26 +7546,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* 
%in, i32* %out) { entry: @@ -7783,26 +7681,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -7935,32 +7831,30 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -8103,36 +7997,34 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -8234,10 +8126,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_unordered_store: @@ -8246,10 +8138,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -8350,10 +8242,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 
; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_store: @@ -8362,10 +8254,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -8478,12 +8370,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_release_store: @@ -8492,12 +8384,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -8610,12 +8502,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_store: @@ -8624,12 +8516,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -8730,10 +8622,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_atomicrmw: @@ -8742,10 +8634,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: 
s_endpgm i32* %out, i32 %in) { entry: @@ -8863,8 +8755,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8878,8 +8769,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8997,12 +8887,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_release_atomicrmw: @@ -9011,12 +8901,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -9146,8 +9036,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9163,8 +9052,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9301,8 +9189,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9318,8 +9205,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9457,8 +9343,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -9466,6 +9351,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_ret_atomicrmw: @@ -9474,8 +9360,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -9483,6 +9368,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -9626,8 +9512,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9637,6 +9522,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_ret_atomicrmw: @@ -9645,8 +9531,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9656,6 +9541,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -9799,8 +9685,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9810,6 +9695,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_ret_atomicrmw: @@ -9818,8 +9704,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9829,6 +9714,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -9942,11 +9828,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_monotonic_cmpxchg: @@ -9955,11 +9840,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10090,10 +9974,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10106,10 +9988,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: 
v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -10239,13 +10119,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_release_monotonic_cmpxchg: @@ -10254,13 +10133,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10403,10 +10281,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10421,10 +10297,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10573,10 +10447,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10591,10 +10463,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10731,10 +10601,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10747,10 +10615,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -10885,10 +10751,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: 
buffer_gl0_inv @@ -10901,10 +10765,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -11051,10 +10913,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11069,10 +10929,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11221,10 +11079,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11239,10 +11095,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11391,10 +11245,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11409,10 +11261,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11561,10 +11411,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11579,10 +11427,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11731,10 +11577,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11749,10 
+11593,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11901,10 +11743,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11919,10 +11759,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12071,10 +11909,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12089,10 +11925,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12241,10 +12075,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12259,10 +12091,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: 
v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12406,13 +12236,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_monotonic_ret_cmpxchg: @@ -12421,13 +12250,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -12582,16 +12410,15 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_monotonic_ret_cmpxchg: @@ -12600,16 +12427,15 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -12761,15 +12587,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_release_monotonic_ret_cmpxchg: @@ -12778,15 +12603,14 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -12953,10 +12777,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12965,6 +12787,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_monotonic_ret_cmpxchg: @@ -12973,10 +12796,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], 
s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12985,6 +12806,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -13151,10 +12973,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13163,6 +12983,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_monotonic_ret_cmpxchg: @@ -13171,10 +12992,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13183,6 +13002,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -13337,16 +13157,15 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_acquire_ret_cmpxchg: @@ -13355,16 +13174,15 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; 
GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -13519,16 +13337,15 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_acquire_ret_cmpxchg: @@ -13537,16 +13354,15 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -13713,10 +13529,8 @@ ; GFX11-WGP-NEXT: 
s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13725,6 +13539,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_release_acquire_ret_cmpxchg: @@ -13733,10 +13548,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13745,6 +13558,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -13911,10 +13725,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13923,6 +13735,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_acquire_ret_cmpxchg: @@ -13931,10 +13744,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13943,6 +13754,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -14109,10 +13921,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt 
null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14121,6 +13931,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_acquire_ret_cmpxchg: @@ -14129,10 +13940,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14141,6 +13950,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -14307,10 +14117,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14319,6 +14127,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_monotonic_seq_cst_ret_cmpxchg: @@ -14327,10 +14136,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14339,6 +14146,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -14505,10 +14313,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14517,6 +14323,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_acquire_seq_cst_ret_cmpxchg: @@ -14525,10 +14332,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14537,6 +14342,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -14703,10 +14509,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14715,6 +14519,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_release_seq_cst_ret_cmpxchg: @@ -14723,10 +14528,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 
s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14735,6 +14538,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -14901,10 +14705,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14913,6 +14715,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: @@ -14921,10 +14724,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 
glc @@ -14933,6 +14734,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -15099,10 +14901,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -15111,6 +14911,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: @@ -15119,10 +14920,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -15131,6 +14930,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, 
i32 %old) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-nontemporal.ll @@ -119,26 +119,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc dlc -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_nontemporal_load_0: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc dlc -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -273,10 +271,10 @@ ; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s0, v0 ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] slc dlc -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 ; GFX11-WGP-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_nontemporal_load_1: @@ -288,10 +286,10 @@ ; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s0, v0 ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] slc dlc -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -411,26 +409,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_nontemporal_store_0: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc +; GFX11-CU-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -559,31 +555,31 @@ ; GFX11-WGP-LABEL: flat_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2] ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s2, v0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2] ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 glc slc dlc +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll +++ 
b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-singlethread.ll @@ -119,26 +119,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -256,26 +254,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; 
GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -393,26 +389,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: 
flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -530,26 +524,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -651,10 +643,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: 
flat_singlethread_unordered_store: @@ -663,10 +655,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -767,10 +759,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_store: @@ -779,10 +771,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -883,10 +875,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; 
GFX11-CU-LABEL: flat_singlethread_release_store: @@ -895,10 +887,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -999,10 +991,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_store: @@ -1011,10 +1003,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -1115,10 +1107,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_atomicrmw: @@ -1127,10 +1119,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1231,10 +1223,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acquire_atomicrmw: @@ -1243,10 +1235,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1347,10 +1339,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; 
GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_release_atomicrmw: @@ -1359,10 +1351,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1463,10 +1455,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_atomicrmw: @@ -1475,10 +1467,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1579,10 +1571,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 
; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_atomicrmw: @@ -1591,10 +1583,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1711,12 +1703,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acquire_ret_atomicrmw: @@ -1725,12 +1717,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1848,12 +1840,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; 
GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_ret_atomicrmw: @@ -1862,12 +1854,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1985,12 +1977,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_ret_atomicrmw: @@ -1999,12 +1991,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -2118,11 +2110,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_monotonic_cmpxchg: @@ -2131,11 +2122,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -2249,11 +2239,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 
s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_cmpxchg: @@ -2262,11 +2251,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -2380,11 +2368,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_release_monotonic_cmpxchg: @@ -2393,11 +2380,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 
s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -2511,11 +2497,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_cmpxchg: @@ -2524,11 +2509,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -2642,11 +2626,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 
v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_cmpxchg: @@ -2655,11 +2638,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -2773,11 +2755,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_cmpxchg: @@ -2786,11 +2767,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; 
GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -2904,11 +2884,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_cmpxchg: @@ -2917,11 +2896,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3035,11 +3013,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 
+; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_release_acquire_cmpxchg: @@ -3048,11 +3025,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3166,11 +3142,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_cmpxchg: @@ -3179,11 +3154,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3297,11 +3271,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_cmpxchg: @@ -3310,11 +3283,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3428,11 +3400,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; 
GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_cmpxchg: @@ -3441,11 +3412,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3559,11 +3529,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_cmpxchg: @@ -3572,11 +3541,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3690,11 
+3658,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_cmpxchg: @@ -3703,11 +3670,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3821,11 +3787,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_cmpxchg: @@ -3834,11 +3799,10 @@ ; GFX11-CU-NEXT: 
s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3952,11 +3916,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_cmpxchg: @@ -3965,11 +3928,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -4107,13 +4069,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_monotonic_ret_cmpxchg: @@ -4122,13 +4083,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -4268,13 +4228,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: 
flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acquire_monotonic_ret_cmpxchg: @@ -4283,13 +4242,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -4429,13 +4387,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_release_monotonic_ret_cmpxchg: @@ -4444,13 +4401,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -4590,13 +4546,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_monotonic_ret_cmpxchg: @@ -4605,13 +4560,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -4751,13 +4705,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; 
GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_monotonic_ret_cmpxchg: @@ -4766,13 +4719,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -4912,13 +4864,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_acquire_ret_cmpxchg: @@ -4927,13 +4878,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5073,13 +5023,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acquire_acquire_ret_cmpxchg: @@ -5088,13 +5037,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5234,13 +5182,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_release_acquire_ret_cmpxchg: @@ -5249,13 +5196,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5395,13 +5341,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], 
s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_acquire_ret_cmpxchg: @@ -5410,13 +5355,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5556,13 +5500,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_acquire_ret_cmpxchg: @@ -5571,13 +5514,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5717,13 +5659,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_monotonic_seq_cst_ret_cmpxchg: @@ -5732,13 +5673,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; 
GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5878,13 +5818,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acquire_seq_cst_ret_cmpxchg: @@ -5893,13 +5832,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6039,13 +5977,12 @@ ; GFX11-WGP-NEXT: 
s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_release_seq_cst_ret_cmpxchg: @@ -6054,13 +5991,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6200,13 +6136,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_acq_rel_seq_cst_ret_cmpxchg: @@ -6215,13 +6150,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6361,13 +6295,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_seq_cst_seq_cst_ret_cmpxchg: @@ -6376,13 +6309,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: 
v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6502,26 +6434,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -6639,26 +6569,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -6776,26 +6704,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -6913,26 +6839,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -7034,10 +6958,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; 
GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_unordered_store: @@ -7046,10 +6970,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -7150,10 +7074,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_store: @@ -7162,10 +7086,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -7266,10 +7190,10 @@ ; 
GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_store: @@ -7278,10 +7202,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -7382,10 +7306,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_store: @@ -7394,10 +7318,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) 
{ entry: @@ -7498,10 +7422,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_atomicrmw: @@ -7510,10 +7434,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -7614,10 +7538,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_atomicrmw: @@ -7626,10 +7550,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -7730,10 +7654,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_atomicrmw: @@ -7742,10 +7666,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -7846,10 +7770,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_atomicrmw: @@ -7858,10 +7782,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; 
GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -7962,10 +7886,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_atomicrmw: @@ -7974,10 +7898,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -8094,12 +8018,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_ret_atomicrmw: @@ -8108,12 +8032,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -8231,12 +8155,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_ret_atomicrmw: @@ -8245,12 +8169,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -8368,12 +8292,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: 
v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_ret_atomicrmw: @@ -8382,12 +8306,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -8501,11 +8425,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_cmpxchg: @@ -8514,11 +8437,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -8632,11 +8554,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -8645,11 +8566,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -8763,11 +8683,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; 
GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_monotonic_cmpxchg: @@ -8776,11 +8695,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -8894,11 +8812,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -8907,11 +8824,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -9025,11 +8941,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -9038,11 +8953,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -9156,11 +9070,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; 
GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -9169,11 +9082,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -9287,11 +9199,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_cmpxchg: @@ -9300,11 +9211,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -9418,11 +9328,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_cmpxchg: @@ -9431,11 +9340,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -9549,11 +9457,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; 
; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -9562,11 +9469,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -9680,11 +9586,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -9693,11 +9598,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { 
entry: @@ -9811,11 +9715,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -9824,11 +9727,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -9942,11 +9844,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -9955,11 
+9856,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10073,11 +9973,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_cmpxchg: @@ -10086,11 +9985,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10204,11 +10102,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; 
GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -10217,11 +10114,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10335,11 +10231,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -10348,11 +10243,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10490,13 +10384,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: @@ -10505,13 +10398,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10651,13 +10543,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_monotonic_ret_cmpxchg: @@ -10666,13 +10557,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10812,13 +10702,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 
v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_monotonic_ret_cmpxchg: @@ -10827,13 +10716,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10973,13 +10861,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: @@ -10988,13 +10875,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -11134,13 +11020,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: @@ -11149,13 +11034,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; 
GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -11295,13 +11179,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_acquire_ret_cmpxchg: @@ -11310,13 +11193,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -11456,13 +11338,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_acquire_ret_cmpxchg: @@ -11471,13 +11352,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -11617,13 +11497,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_acquire_ret_cmpxchg: @@ -11632,13 +11511,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -11778,13 +11656,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: @@ -11793,13 +11670,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -11939,13 +11815,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: @@ -11954,13 +11829,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -12100,13 +11974,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: @@ -12115,13 +11988,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -12261,13 +12133,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: @@ -12276,13 +12147,12 @@ ; 
GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -12422,13 +12292,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_release_seq_cst_ret_cmpxchg: @@ -12437,13 +12306,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 
glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -12583,13 +12451,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: @@ -12598,13 +12465,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -12744,13 +12610,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 
v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: @@ -12759,13 +12624,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-system.ll @@ -119,26 +119,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -256,26 +254,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; 
GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -404,30 +400,28 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -566,34 +560,32 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -695,10 +687,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_unordered_store: @@ -707,10 +699,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -811,10 +803,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_monotonic_store: @@ -823,10 +815,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -941,12 +933,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_release_store: @@ -955,12 +947,12 @@ ; 
GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -1075,12 +1067,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_seq_cst_store: @@ -1089,12 +1081,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -1195,10 +1187,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_monotonic_atomicrmw: @@ -1207,10 +1199,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1332,8 +1324,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1348,8 +1339,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1470,12 +1460,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_release_atomicrmw: @@ -1484,12 +1474,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1625,8 +1615,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1643,8 +1632,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1788,8 +1776,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: 
v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1806,8 +1793,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1943,14 +1929,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_acquire_ret_atomicrmw: @@ -1959,14 +1945,14 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -2109,8 +2095,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 
0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2119,6 +2104,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_acq_rel_ret_atomicrmw: @@ -2127,8 +2113,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2137,6 +2122,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -2279,8 +2265,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2289,6 +2274,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_seq_cst_ret_atomicrmw: @@ -2297,8 +2283,7 @@ ; GFX11-CU-NEXT: 
s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2307,6 +2292,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -2420,11 +2406,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_monotonic_monotonic_cmpxchg: @@ -2433,11 +2418,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -2572,10 +2556,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; 
GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2589,10 +2571,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2725,13 +2705,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_release_monotonic_cmpxchg: @@ -2740,13 +2719,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -2895,10 +2873,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2914,10 +2890,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3073,10 +3047,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3092,10 +3064,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3237,10 +3207,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3254,10 +3222,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3397,10 +3363,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3414,10 +3378,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3571,10 +3533,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3590,10 +3550,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3749,10 +3707,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3768,10 +3724,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3927,10 +3881,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3946,10 +3898,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4105,10 +4055,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4124,10 +4072,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4283,10 +4229,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4302,10 +4246,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4461,10 +4403,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4480,10 +4420,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4639,10 +4577,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4658,10 +4594,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4817,10 +4751,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; 
GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4836,10 +4768,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -4984,13 +4914,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_monotonic_monotonic_ret_cmpxchg: @@ -4999,13 +4928,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5156,15 +5084,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_acquire_monotonic_ret_cmpxchg: @@ -5173,15 +5100,14 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; 
GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5335,15 +5261,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_release_monotonic_ret_cmpxchg: @@ -5352,15 +5277,14 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5525,10 +5449,8 @@ ; GFX11-WGP-NEXT: s_load_b64 
s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5536,6 +5458,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_acq_rel_monotonic_ret_cmpxchg: @@ -5544,10 +5467,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5555,6 +5476,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5719,10 +5641,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5730,6 +5650,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_seq_cst_monotonic_ret_cmpxchg: @@ -5738,10 +5659,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -5749,6 +5668,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5899,15 +5819,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_monotonic_acquire_ret_cmpxchg: @@ -5916,15 +5835,14 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6075,15 +5993,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_acquire_acquire_ret_cmpxchg: @@ -6092,15 +6009,14 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6265,10 +6181,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6276,6 +6190,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_release_acquire_ret_cmpxchg: @@ -6284,10 +6199,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6295,6 +6208,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6459,10 +6373,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6470,6 +6382,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_acq_rel_acquire_ret_cmpxchg: @@ -6478,10 +6391,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6489,6 +6400,7 @@ ; 
GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6653,10 +6565,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6664,6 +6574,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_seq_cst_acquire_ret_cmpxchg: @@ -6672,10 +6583,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6683,6 +6592,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6847,10 +6757,8 @@ 
; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6858,6 +6766,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_monotonic_seq_cst_ret_cmpxchg: @@ -6866,10 +6775,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -6877,6 +6784,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -7041,10 +6949,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7052,6 +6958,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_acquire_seq_cst_ret_cmpxchg: @@ -7060,10 +6967,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7071,6 +6976,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -7235,10 +7141,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7246,6 +7150,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_release_seq_cst_ret_cmpxchg: @@ -7254,10 +7159,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7265,6 +7168,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -7429,10 +7333,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7440,6 +7342,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 
+; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_acq_rel_seq_cst_ret_cmpxchg: @@ -7448,10 +7351,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7459,6 +7360,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -7623,10 +7525,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7634,6 +7534,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_seq_cst_seq_cst_ret_cmpxchg: @@ -7642,10 +7543,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 
s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -7653,6 +7552,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -7772,26 +7672,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], 
v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -7909,26 +7807,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -8063,32 +7959,30 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -8233,36 +8127,34 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -8364,10 +8256,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_unordered_store: @@ -8376,10 +8268,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -8480,10 +8372,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_store: @@ -8492,10 +8384,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -8610,12 +8502,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_release_store: @@ -8624,12 +8516,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -8744,12 +8636,12 @@ ; GFX11-WGP-NEXT: s_load_b64 
s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_store: @@ -8758,12 +8650,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -8864,10 +8756,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_atomicrmw: @@ -8876,10 +8768,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; 
GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -8999,8 +8891,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9014,8 +8905,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9135,12 +9025,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_release_atomicrmw: @@ -9149,12 +9039,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -9288,8 +9178,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9305,8 +9194,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9447,8 +9335,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9464,8 +9351,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9605,8 +9491,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) @@ -9614,6 +9499,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_ret_atomicrmw: @@ -9622,8 +9508,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) @@ -9631,6 +9516,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -9778,8 +9664,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9789,6 +9674,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_ret_atomicrmw: @@ -9797,8 +9683,7 @@ ; 
GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9808,6 +9693,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -9955,8 +9841,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9966,6 +9851,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_ret_atomicrmw: @@ -9974,8 +9860,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 @@ -9985,6 +9870,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ 
-10098,11 +9984,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_monotonic_cmpxchg: @@ -10111,11 +9996,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10248,10 +10132,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10264,10 +10146,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 
s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -10399,13 +10279,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_release_monotonic_cmpxchg: @@ -10414,13 +10293,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10567,10 +10445,8 @@ ; GFX11-WGP-NEXT: 
s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10585,10 +10461,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10741,10 +10615,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10759,10 +10631,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; 
GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10901,10 +10771,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10917,10 +10785,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -11057,10 +10923,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; 
GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -11073,10 +10937,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -11227,10 +11089,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11245,10 +11105,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11401,10 +11259,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; 
GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11419,10 +11275,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11575,10 +11429,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11593,10 +11445,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; 
GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11749,10 +11599,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11767,10 +11615,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11923,10 +11769,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; 
GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -11941,10 +11785,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12097,10 +11939,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12115,10 +11955,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12271,10 +12109,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 
s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12289,10 +12125,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12445,10 +12279,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12463,10 +12295,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 
v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -12610,13 +12440,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_monotonic_ret_cmpxchg: @@ -12625,13 +12454,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -12788,16 +12616,15 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_monotonic_ret_cmpxchg: @@ -12806,16 +12633,15 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -12969,15 +12795,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_release_monotonic_ret_cmpxchg: @@ -12986,15 +12811,14 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -13165,10 +12989,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13177,6 +12999,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; 
GFX11-CU-LABEL: flat_system_one_as_acq_rel_monotonic_ret_cmpxchg: @@ -13185,10 +13008,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13197,6 +13018,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -13367,10 +13189,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13379,6 +13199,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_monotonic_ret_cmpxchg: @@ -13387,10 +13208,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13399,6 +13218,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -13555,16 +13375,15 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_acquire_ret_cmpxchg: @@ -13573,16 +13392,15 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: 
v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -13739,16 +13557,15 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_acquire_acquire_ret_cmpxchg: @@ -13757,16 +13574,15 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -13937,10 +13753,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13949,6 +13763,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_release_acquire_ret_cmpxchg: @@ -13957,10 +13772,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13969,6 +13782,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -14139,10 +13953,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14151,6 +13963,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_acquire_ret_cmpxchg: @@ -14159,10 +13972,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14171,6 +13982,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -14341,10 +14153,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; 
GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14353,6 +14163,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_acquire_ret_cmpxchg: @@ -14361,10 +14172,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14373,6 +14182,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -14543,10 +14353,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14555,6 +14363,7 @@ 
; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_monotonic_seq_cst_ret_cmpxchg: @@ -14563,10 +14372,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14575,6 +14382,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -14745,10 +14553,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14757,6 +14563,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: 
flat_system_one_as_acquire_seq_cst_ret_cmpxchg: @@ -14765,10 +14572,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14777,6 +14582,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -14947,10 +14753,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14959,6 +14763,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_release_seq_cst_ret_cmpxchg: @@ -14967,10 +14772,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 
s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -14979,6 +14782,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -15149,10 +14953,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -15161,6 +14963,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_acq_rel_seq_cst_ret_cmpxchg: @@ -15169,10 +14972,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 
v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -15181,6 +14982,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -15351,10 +15153,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -15363,6 +15163,7 @@ ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_system_one_as_seq_cst_seq_cst_ret_cmpxchg: @@ -15371,10 +15172,8 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -15383,6 +15182,7 @@ ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-volatile.ll @@ -67,28 +67,26 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_nontemporal_load_0: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -170,10 +168,10 @@ ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc dlc ; GFX11-WGP-NEXT: 
s_waitcnt vmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_nontemporal_load_1: @@ -186,10 +184,10 @@ ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s1, 0, s0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s3 :: v_dual_mov_b32 v0, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -261,28 +259,26 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_nontemporal_store_0: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, 
s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -357,33 +353,33 @@ ; GFX11-WGP-LABEL: flat_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-WGP-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-WGP-NEXT: v_add_co_u32 v0, s0, s2, v0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[1:2] ; GFX11-WGP-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s1 -; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_3) +; GFX11-CU-NEXT: s_delay_alu instid0(VALU_DEP_2) ; GFX11-CU-NEXT: v_add_co_u32 v0, s0, s2, v0 ; GFX11-CU-NEXT: flat_load_b32 v2, v[1:2] ; GFX11-CU-NEXT: v_add_co_ci_u32_e64 v1, null, s3, 0, s0 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* 
%in, i32* %out) { entry: @@ -455,28 +451,26 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_volatile_workgroup_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -543,12 +537,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: 
s_endpgm ; ; GFX11-CU-LABEL: flat_volatile_workgroup_release_store: @@ -557,11 +551,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-wavefront.ll @@ -119,26 +119,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 
v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -256,26 +254,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -393,26 +389,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -530,26 +524,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -651,10 +643,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_unordered_store: @@ -663,10 +655,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -767,10 +759,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_store: @@ -779,10 +771,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -883,10 +875,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_release_store: @@ -895,10 +887,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -999,10 +991,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_store: @@ -1011,10 +1003,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -1115,10 +1107,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_atomicrmw: @@ -1127,10 +1119,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1231,10 +1223,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acquire_atomicrmw: @@ -1243,10 +1235,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1347,10 +1339,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_release_atomicrmw: @@ -1359,10 +1351,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1463,10 +1455,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_atomicrmw: @@ -1475,10 +1467,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, 
s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1579,10 +1571,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_atomicrmw: @@ -1591,10 +1583,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1711,12 +1703,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; 
GFX11-CU-LABEL: flat_wavefront_acquire_ret_atomicrmw: @@ -1725,12 +1717,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1848,12 +1840,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_ret_atomicrmw: @@ -1862,12 +1854,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1985,12 +1977,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_ret_atomicrmw: @@ -1999,12 +1991,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -2118,11 +2110,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_monotonic_cmpxchg: @@ -2131,11 +2122,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: 
v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -2249,11 +2239,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acquire_monotonic_cmpxchg: @@ -2262,11 +2251,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -2380,11 +2368,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_release_monotonic_cmpxchg: @@ -2393,11 +2380,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -2511,11 +2497,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_monotonic_cmpxchg: @@ -2524,11 +2509,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -2642,11 +2626,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_monotonic_cmpxchg: @@ -2655,11 +2638,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -2773,11 +2755,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; 
GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_acquire_cmpxchg: @@ -2786,11 +2767,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -2904,11 +2884,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acquire_acquire_cmpxchg: @@ -2917,11 +2896,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: 
v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3035,11 +3013,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_release_acquire_cmpxchg: @@ -3048,11 +3025,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3166,11 +3142,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 
v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_acquire_cmpxchg: @@ -3179,11 +3154,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3297,11 +3271,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_acquire_cmpxchg: @@ -3310,11 +3283,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3428,11 +3400,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_seq_cst_cmpxchg: @@ -3441,11 +3412,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3559,11 +3529,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; 
GFX11-CU-LABEL: flat_wavefront_acquire_seq_cst_cmpxchg: @@ -3572,11 +3541,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3690,11 +3658,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_release_seq_cst_cmpxchg: @@ -3703,11 +3670,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3821,11 +3787,10 
@@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_seq_cst_cmpxchg: @@ -3834,11 +3799,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3952,11 +3916,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_seq_cst_cmpxchg: @@ -3965,11 +3928,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], 
s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -4107,13 +4069,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_monotonic_ret_cmpxchg: @@ -4122,13 +4083,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -4268,13 +4228,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acquire_monotonic_ret_cmpxchg: @@ -4283,13 +4242,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -4429,13 +4387,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_release_monotonic_ret_cmpxchg: @@ -4444,13 +4401,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -4590,13 +4546,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_monotonic_ret_cmpxchg: @@ -4605,13 +4560,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -4751,13 +4705,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_monotonic_ret_cmpxchg: @@ -4766,13 +4719,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -4912,13 +4864,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_acquire_ret_cmpxchg: @@ -4927,13 +4878,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5073,13 +5023,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acquire_acquire_ret_cmpxchg: @@ -5088,13 +5037,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5234,13 +5182,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_release_acquire_ret_cmpxchg: @@ -5249,13 +5196,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5395,13 +5341,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_acquire_ret_cmpxchg: @@ -5410,13 +5355,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5556,13 +5500,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_acquire_ret_cmpxchg: @@ -5571,13 +5514,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5717,13 +5659,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_monotonic_seq_cst_ret_cmpxchg: @@ -5732,13 +5673,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5878,13 +5818,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acquire_seq_cst_ret_cmpxchg: @@ -5893,13 +5832,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6039,13 +5977,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_release_seq_cst_ret_cmpxchg: @@ -6054,13 +5991,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6200,13 +6136,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_acq_rel_seq_cst_ret_cmpxchg: @@ -6215,13 +6150,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6361,13 +6295,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_seq_cst_seq_cst_ret_cmpxchg: @@ -6376,13 +6309,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6502,26 +6434,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -6639,26 +6569,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -6776,26 +6704,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -6913,26 +6839,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -7034,10 +6958,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_unordered_store: @@ -7046,10 +6970,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -7150,10 +7074,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_store: @@ -7162,10 +7086,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -7266,10 +7190,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_store: @@ -7278,10 +7202,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -7382,10 +7306,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_store: @@ -7394,10 +7318,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -7498,10 +7422,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_atomicrmw: @@ -7510,10 +7434,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -7614,10 +7538,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; 
GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_atomicrmw: @@ -7626,10 +7550,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -7730,10 +7654,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_atomicrmw: @@ -7742,10 +7666,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -7846,10 +7770,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_atomicrmw: @@ -7858,10 +7782,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -7962,10 +7886,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_atomicrmw: @@ -7974,10 +7898,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -8094,12 +8018,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_ret_atomicrmw: @@ -8108,12 +8032,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -8231,12 +8155,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_ret_atomicrmw: @@ -8245,12 +8169,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; 
GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -8368,12 +8292,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_ret_atomicrmw: @@ -8382,12 +8306,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -8501,11 +8425,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] 
offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_cmpxchg: @@ -8514,11 +8437,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -8632,11 +8554,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -8645,11 +8566,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -8763,11 +8683,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_monotonic_cmpxchg: @@ -8776,11 +8695,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -8894,11 +8812,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: 
s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -8907,11 +8824,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -9025,11 +8941,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -9038,11 +8953,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 
%old) { entry: @@ -9156,11 +9070,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -9169,11 +9082,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -9287,11 +9199,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_acquire_cmpxchg: @@ -9300,11 
+9211,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -9418,11 +9328,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_acquire_cmpxchg: @@ -9431,11 +9340,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -9549,11 +9457,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; 
GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -9562,11 +9469,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -9680,11 +9586,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -9693,11 +9598,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 
s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -9811,11 +9715,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -9824,11 +9727,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -9942,11 +9844,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -9955,11 +9856,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10073,11 +9973,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_seq_cst_cmpxchg: @@ -10086,11 +9985,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 
v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10204,11 +10102,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -10217,11 +10114,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10335,11 +10231,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -10348,11 +10243,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10490,13 +10384,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: @@ -10505,13 +10398,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10651,13 +10543,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_monotonic_ret_cmpxchg: @@ -10666,13 +10557,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10812,13 +10702,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: @@ -10827,13 +10716,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10973,13 +10861,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; 
GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: @@ -10988,13 +10875,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -11134,13 +11020,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_acquire_ret_cmpxchg: @@ -11149,13 +11034,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 
s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -11295,13 +11179,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_acquire_ret_cmpxchg: @@ -11310,13 +11193,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: 
flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -11456,13 +11338,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_acquire_ret_cmpxchg: @@ -11471,13 +11352,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -11617,13 +11497,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: @@ -11632,13 +11511,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -11778,13 +11656,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: @@ -11793,13 +11670,12 @@ ; GFX11-CU-NEXT: s_load_b64 
s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -11939,13 +11815,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: @@ -11954,13 +11829,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -12100,13 +11974,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: @@ -12115,13 +11988,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -12261,13 +12133,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_release_seq_cst_ret_cmpxchg: @@ -12276,13 +12147,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -12422,13 +12292,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_acq_relc_seq_cst_ret_cmpxchg: @@ 
-12437,13 +12306,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -12583,13 +12451,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: @@ -12598,13 +12465,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, 
v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-flat-workgroup.ll @@ -119,26 +119,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -256,26 +254,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -401,28 +397,26 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_acquire_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -557,31 +551,29 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -683,10 +675,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_unordered_store: @@ -695,10 +687,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -799,10 +791,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_store: @@ -811,10 +803,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; 
GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -924,12 +916,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_release_store: @@ -938,11 +930,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -1052,12 +1044,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_store: @@ -1066,11 +1058,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], 
s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -1171,10 +1163,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_atomicrmw: @@ -1183,10 +1175,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1299,8 +1291,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -1314,11 +1305,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; 
GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1428,12 +1419,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_release_atomicrmw: @@ -1442,11 +1433,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1568,8 +1559,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) 
; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1585,12 +1575,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1712,8 +1702,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -1729,12 +1718,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -1854,13 +1843,13 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 
v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_acquire_ret_atomicrmw: @@ -1869,12 +1858,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -2004,8 +1993,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2013,6 +2001,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_acq_rel_ret_atomicrmw: @@ -2021,13 +2010,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: 
flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -2157,8 +2146,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2166,6 +2154,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_ret_atomicrmw: @@ -2174,13 +2163,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -2294,11 +2283,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_monotonic_cmpxchg: @@ -2307,11 +2295,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -2437,10 +2424,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2453,12 +2438,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] 
offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -2581,13 +2565,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_release_monotonic_cmpxchg: @@ -2596,12 +2579,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -2736,10 +2718,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2754,13 +2734,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -2895,10 +2874,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -2913,13 +2890,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3045,10 +3021,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3061,12 +3035,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3192,10 +3165,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 
v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3208,12 +3179,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3348,10 +3318,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3366,13 +3334,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3507,10 +3474,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3525,13 +3490,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3666,10 +3630,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3684,13 +3646,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3825,10 +3786,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -3843,13 +3802,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-CU-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -3987,13 +3945,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_monotonic_ret_cmpxchg: @@ -4002,13 +3959,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -4154,14 +4110,13 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_acquire_monotonic_ret_cmpxchg: @@ -4170,13 +4125,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -4325,15 +4279,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_release_monotonic_ret_cmpxchg: @@ -4342,14 
+4295,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -4504,16 +4456,15 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_acq_rel_monotonic_ret_cmpxchg: @@ -4522,14 +4473,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 
s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -4684,16 +4634,15 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_monotonic_ret_cmpxchg: @@ -4702,14 +4651,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -4855,14 +4803,13 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_acquire_ret_cmpxchg: @@ -4871,13 +4818,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5023,14 +4969,13 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_acquire_acquire_ret_cmpxchg: @@ -5039,13 +4984,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5200,16 +5144,15 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; 
GFX11-CU-LABEL: flat_workgroup_release_acquire_ret_cmpxchg: @@ -5218,14 +5161,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5380,16 +5322,15 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_acq_rel_acquire_ret_cmpxchg: @@ -5398,14 +5339,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; 
GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5560,16 +5500,15 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_acquire_ret_cmpxchg: @@ -5578,14 +5517,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: 
flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5740,16 +5678,15 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_monotonic_seq_cst_ret_cmpxchg: @@ -5758,14 +5695,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -5920,16 +5856,15 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 
v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_acquire_seq_cst_ret_cmpxchg: @@ -5938,14 +5873,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6100,16 +6034,15 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt 
null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_release_seq_cst_ret_cmpxchg: @@ -6118,14 +6051,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6280,16 +6212,15 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: 
flat_workgroup_acq_rel_seq_cst_ret_cmpxchg: @@ -6298,14 +6229,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6460,16 +6390,15 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_seq_cst_seq_cst_ret_cmpxchg: @@ -6478,14 +6407,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -6605,26 +6533,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_unordered_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -6742,26 +6668,24 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -6883,28 +6807,26 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_load: ; GFX11-CU: ; %bb.0: ; 
%entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -7030,30 +6952,28 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_load_b32 v2, v[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_load: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: flat_load_b32 v2, v[0:1] -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: 
flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %in, i32* %out) { entry: @@ -7155,10 +7075,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_unordered_store: @@ -7167,10 +7087,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -7271,10 +7191,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_store: @@ -7283,10 +7203,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: 
v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -7391,12 +7311,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_store: @@ -7405,10 +7325,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -7513,12 +7433,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_store: @@ -7527,10 +7447,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32* %out) { entry: @@ -7631,10 +7551,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_atomicrmw: @@ -7643,10 +7563,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -7753,8 +7673,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -7767,10 +7686,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -7875,12 +7794,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_atomicrmw: @@ -7889,10 +7808,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -8003,8 +7922,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8019,10 +7937,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -8133,8 +8051,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8149,10 +8066,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -8273,14 +8190,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_ret_atomicrmw: @@ 
-8289,12 +8206,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -8420,8 +8337,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8430,6 +8346,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_ret_atomicrmw: @@ -8438,12 +8355,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -8569,8 +8486,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 
0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -8579,6 +8495,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_ret_atomicrmw: @@ -8587,12 +8504,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_swap_b32 v2, v[0:1], v2 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in) { entry: @@ -8706,11 +8623,10 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_monotonic_cmpxchg: @@ -8719,11 +8635,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -8843,10 +8758,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -8858,11 +8771,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -8980,13 +8892,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 
-; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_monotonic_cmpxchg: @@ -8995,11 +8906,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -9123,10 +9033,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -9140,11 +9048,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -9268,10 +9175,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -9285,11 +9190,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -9409,10 +9313,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: 
v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -9424,11 +9326,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -9548,10 +9449,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -9563,11 +9462,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 
%in, i32 %old) { entry: @@ -9691,10 +9589,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -9708,11 +9604,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -9836,10 +9731,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -9853,11 +9746,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -9981,10 +9873,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -9998,11 +9888,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10126,10 +10015,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 
v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10143,11 +10030,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10271,10 +10157,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10288,11 +10172,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 
v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10416,10 +10299,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10433,11 +10314,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10561,10 +10441,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: 
flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10578,11 +10456,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10706,10 +10583,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 @@ -10723,11 +10598,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v[0:1], v[2:3] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -10865,13 +10739,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; 
GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonicmonotonic_ret_cmpxchg: @@ -10880,13 +10753,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -11030,15 +10902,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 
; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_monotonic_ret_cmpxchg: @@ -11047,13 +10918,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -11197,15 +11067,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_monotonic_ret_cmpxchg: @@ -11214,13 +11083,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -11368,10 +11236,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -11379,6 +11245,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: @@ -11387,13 +11254,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], 
v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -11541,10 +11407,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -11552,6 +11416,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: @@ -11560,13 +11425,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -11710,15 +11574,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_acquire_ret_cmpxchg: @@ -11727,13 +11590,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -11877,15 +11739,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-WGP-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_acquire_ret_cmpxchg: @@ -11894,13 +11755,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -12048,10 +11908,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12059,6 +11917,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_acquire_ret_cmpxchg: @@ -12067,13 +11926,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -12221,10 +12079,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12232,6 +12088,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: @@ -12240,13 +12097,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 
v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -12394,10 +12250,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12405,6 +12259,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: @@ -12413,13 +12268,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -12567,10 +12421,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 
; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12578,6 +12430,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: @@ -12586,13 +12439,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -12740,10 +12592,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: 
v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12751,6 +12601,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: @@ -12759,13 +12610,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -12913,10 +12763,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -12924,6 +12772,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_release_seq_cst_ret_cmpxchg: @@ -12932,13 +12781,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -13086,10 +12934,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13097,6 +12943,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: @@ -13105,13 +12952,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; 
GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: @@ -13259,10 +13105,8 @@ ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc @@ -13270,6 +13114,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: flat_store_b32 v[0:1], v2 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: flat_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: @@ -13278,13 +13123,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v3, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX11-CU-NEXT: v_mov_b32_e32 v2, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 ; GFX11-CU-NEXT: flat_atomic_cmpswap_b32 v2, v[0:1], v[2:3] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: flat_store_b32 v[0:1], 
v2 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32* %out, i32 %in, i32 %old) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -123,6 +123,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_unordered_load: @@ -133,6 +134,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -253,6 +255,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_monotonic_load: @@ -263,6 +266,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -395,6 +399,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_acquire_load: @@ -407,6 +412,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 
; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -545,6 +551,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_seq_cst_load: @@ -558,6 +565,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -667,10 +675,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_unordered_store: @@ -678,10 +686,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -790,10 +798,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, 
s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_monotonic_store: @@ -801,10 +809,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -926,12 +934,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_release_store: @@ -939,12 +947,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -1066,12 +1074,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_seq_cst_store: @@ -1079,12 +1087,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -1193,10 +1201,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_monotonic_atomicrmw: @@ -1204,10 +1212,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1335,9 +1343,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1349,9 +1356,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -1477,12 +1483,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_release_atomicrmw: @@ -1490,12 +1496,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1636,9 +1642,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1652,9 +1657,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1801,9 +1805,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1817,9 +1820,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1962,14 +1964,14 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_acquire_ret_atomicrmw: @@ -1977,14 +1979,14 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -2135,9 +2137,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2145,6 +2146,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; 
GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_acq_rel_ret_atomicrmw: @@ -2152,9 +2154,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2162,6 +2163,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -2312,9 +2314,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2322,6 +2323,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_seq_cst_ret_atomicrmw: @@ -2329,9 +2331,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: 
v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2339,6 +2340,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -2457,9 +2459,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: @@ -2469,9 +2471,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2609,8 +2611,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -2624,8 +2625,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -2761,11 +2761,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_release_monotonic_cmpxchg: @@ -2775,11 +2775,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2930,8 +2930,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ 
-2947,8 +2946,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3105,8 +3103,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3122,8 +3119,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3267,8 +3263,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -3282,8 +3277,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -3425,8 +3419,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -3440,8 +3433,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -3596,8 +3588,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3613,8 +3604,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: 
global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3771,8 +3761,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3788,8 +3777,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3946,8 +3934,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3963,8 +3950,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4121,8 +4107,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4138,8 +4123,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4296,8 +4280,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4313,8 +4296,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4471,8 +4453,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4488,8 +4469,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4646,8 +4626,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4663,8 +4642,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4821,8 +4799,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4838,8 +4815,7 
@@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4984,11 +4960,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_monotonic_monotonic_ret_cmpxchg: @@ -4998,11 +4974,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5153,13 +5129,13 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] 
offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: @@ -5169,13 +5145,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5329,13 +5305,13 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_release_monotonic_ret_cmpxchg: @@ -5345,13 +5321,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5515,8 +5491,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5524,6 +5499,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: @@ -5533,8 +5509,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5542,6 +5517,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5705,8 +5681,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5714,6 +5689,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: @@ -5723,8 +5699,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5732,6 +5707,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5882,13 +5858,13 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_monotonic_acquire_ret_cmpxchg: @@ -5898,13 
+5874,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6055,13 +6031,13 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: @@ -6071,13 +6047,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6241,8 +6217,7 @@ ; 
GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6250,6 +6225,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: @@ -6259,8 +6235,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6268,6 +6243,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6431,8 +6407,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6440,6 +6415,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: 
buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: @@ -6449,8 +6425,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6458,6 +6433,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6621,8 +6597,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6630,6 +6605,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: @@ -6639,8 +6615,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6648,6 +6623,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6811,8 +6787,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6820,6 +6795,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_monotonic_seq_cst_ret_cmpxchg: @@ -6829,8 +6805,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6838,6 +6813,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7001,8 +6977,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -7010,6 +6985,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_acquire_seq_cst_ret_cmpxchg: @@ -7019,8 +6995,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -7028,6 +7003,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7191,8 +7167,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -7200,6 +7175,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; 
GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_release_seq_cst_ret_cmpxchg: @@ -7209,8 +7185,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -7218,6 +7193,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7381,8 +7357,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -7390,6 +7365,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_acq_rel_seq_cst_ret_cmpxchg: @@ -7399,8 +7375,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -7408,6 +7383,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7571,8 +7547,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -7580,6 +7555,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: @@ -7589,8 +7565,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -7598,6 +7573,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7720,6 +7696,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_unordered_load: @@ -7730,6 +7707,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -7850,6 +7828,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_load: @@ -7860,6 +7839,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -7992,6 +7972,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_load: @@ -8004,6 +7985,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -8142,6 +8124,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_load: @@ -8155,6 +8138,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -8264,10 +8248,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_unordered_store: @@ -8275,10 +8259,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -8387,10 +8371,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_store: @@ -8398,10 +8382,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: 
global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -8523,12 +8507,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_release_store: @@ -8536,12 +8520,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -8663,12 +8647,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_store: @@ -8676,12 +8660,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; 
GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -8790,10 +8774,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_atomicrmw: @@ -8801,10 +8785,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -8932,9 +8916,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX11-WGP-NEXT: buffer_gl0_inv @@ -8946,9 +8929,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -9074,12 +9056,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_release_atomicrmw: @@ -9087,12 +9069,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -9233,9 +9215,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 
v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -9249,9 +9230,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -9398,9 +9378,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -9414,9 +9393,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -9559,14 +9537,14 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; 
GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_ret_atomicrmw: @@ -9574,14 +9552,14 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -9732,9 +9710,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -9742,6 +9719,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_ret_atomicrmw: @@ -9749,9 +9727,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -9759,6 +9736,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -9909,9 +9887,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -9919,6 +9896,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_ret_atomicrmw: @@ -9926,9 +9904,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -9936,6 +9913,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -10054,9 +10032,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: @@ -10066,9 +10044,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10206,8 +10184,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10221,8 +10198,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -10358,11 +10334,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: @@ -10372,11 +10348,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10527,8 +10503,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10544,8 +10519,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10702,8 +10676,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10719,8 +10692,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10864,8 +10836,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10879,8 +10850,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -11022,8 +10992,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; 
GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -11037,8 +11006,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -11193,8 +11161,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11210,8 +11177,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11368,8 +11334,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11385,8 +11350,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11543,8 +11507,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11560,8 +11523,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11718,8 +11680,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11735,8 +11696,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 
0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11893,8 +11853,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11910,8 +11869,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -12068,8 +12026,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -12085,8 +12042,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -12243,8 +12199,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -12260,8 +12215,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -12418,8 +12372,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -12435,8 +12388,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -12581,11 +12533,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, 
v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_monotonic_ret_cmpxchg: @@ -12595,11 +12547,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12750,13 +12702,13 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: @@ -12766,13 +12718,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; 
GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12936,8 +12888,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -12945,6 +12896,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: @@ -12954,8 +12906,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -12963,6 +12914,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -13126,8 +13078,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13135,6 +13086,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: @@ -13144,8 +13096,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13153,6 +13104,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -13303,13 +13255,13 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_acquire_ret_cmpxchg: @@ -13319,13 +13271,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -13476,13 +13428,13 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: @@ -13492,13 +13444,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -13662,8 +13614,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13671,6 +13622,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: @@ -13680,8 +13632,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13689,6 +13640,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -13852,8 +13804,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13861,6 +13812,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; 
; GFX11-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: @@ -13870,8 +13822,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13879,6 +13830,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -14042,8 +13994,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14051,6 +14002,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: @@ -14060,8 +14012,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14069,6 
+14020,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -14232,8 +14184,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14241,6 +14192,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_monotonic_seq_cst_ret_cmpxchg: @@ -14250,8 +14202,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14259,6 +14210,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -14422,8 +14374,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 
:: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14431,6 +14382,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_acquire_seq_cst_ret_cmpxchg: @@ -14440,8 +14392,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14449,6 +14400,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -14612,8 +14564,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14621,6 +14572,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_release_seq_cst_ret_cmpxchg: @@ -14630,8 +14582,7 @@ ; GFX11-CU-NEXT: s_load_b64 
s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14639,6 +14590,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -14802,8 +14754,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14811,6 +14762,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_acq_rel_seq_cst_ret_cmpxchg: @@ -14820,8 +14772,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14829,6 +14780,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, 
s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -14992,8 +14944,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -15001,6 +14952,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: @@ -15010,8 +14962,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -15019,6 +14970,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-nontemporal.ll @@ -121,23 +121,23 @@ ; GFX11-WGP-LABEL: global_nontemporal_load_0: ; GFX11-WGP: ; %bb.0: ; %entry ; 
GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_nontemporal_load_0: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -265,23 +265,23 @@ ; GFX11-WGP-LABEL: global_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v0, v0, s[0:1] slc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_load_b32 v0, v0, s[0:1] slc dlc ; GFX11-CU-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -402,23 +402,23 @@ ; GFX11-WGP-LABEL: global_nontemporal_store_0: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_nontemporal_store_0: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -541,23 +541,23 @@ ; GFX11-WGP-LABEL: global_nontemporal_store_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_nontemporal_store_1: ; GFX11-CU: ; %bb.0: ; %entry 
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] glc slc dlc +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -123,6 +123,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_unordered_load: @@ -133,6 +134,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -253,6 +255,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_monotonic_load: @@ -263,6 +266,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -383,6 +387,7 
@@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acquire_load: @@ -393,6 +398,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -513,6 +519,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_load: @@ -523,6 +530,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -632,10 +640,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_unordered_store: @@ -643,10 +651,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; 
GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -755,10 +763,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_monotonic_store: @@ -766,10 +774,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -878,10 +886,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_release_store: @@ -889,10 +897,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 
0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -1001,10 +1009,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_store: @@ -1012,10 +1020,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -1124,10 +1132,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_monotonic_atomicrmw: @@ -1135,10 +1143,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1247,10 +1255,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acquire_atomicrmw: @@ -1258,10 +1266,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1370,10 +1378,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_release_atomicrmw: @@ -1381,10 +1389,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1493,10 +1501,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_atomicrmw: @@ -1504,10 +1512,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1616,10 +1624,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_atomicrmw: @@ -1627,10 +1635,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 
; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1757,12 +1765,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acquire_ret_atomicrmw: @@ -1770,12 +1778,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1903,12 +1911,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: 
global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_ret_atomicrmw: @@ -1916,12 +1924,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -2049,12 +2057,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_ret_atomicrmw: @@ -2062,12 +2070,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -2186,9 +2194,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: @@ -2198,9 +2206,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2319,9 +2327,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg: @@ -2331,9 +2339,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; 
GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2452,9 +2460,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_release_monotonic_cmpxchg: @@ -2464,9 +2472,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2585,9 +2593,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: @@ -2597,9 +2605,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], 
s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2718,9 +2726,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: @@ -2730,9 +2738,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2851,9 +2859,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_monotonic_acquire_cmpxchg: @@ -2863,9 +2871,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: 
global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2984,9 +2992,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: @@ -2996,9 +3004,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3117,9 +3125,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_release_acquire_cmpxchg: @@ -3129,9 +3137,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3250,9 +3258,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: @@ -3262,9 +3270,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3383,9 +3391,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: @@ -3395,9 +3403,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3516,9 +3524,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_monotonic_seq_cst_cmpxchg: @@ -3528,9 +3536,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3649,9 +3657,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acquire_seq_cst_cmpxchg: @@ -3661,9 +3669,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3782,9 +3790,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_release_seq_cst_cmpxchg: @@ -3794,9 +3802,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3915,9 +3923,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_seq_cst_cmpxchg: @@ -3927,9 +3935,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4048,9 +4056,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: @@ -4060,9 +4068,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4201,11 +4209,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_monotonic_monotonic_ret_cmpxchg: @@ -4215,11 +4223,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4360,11 +4368,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: @@ -4374,11 +4382,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4519,11 +4527,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 
s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_release_monotonic_ret_cmpxchg: @@ -4533,11 +4541,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4678,11 +4686,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: @@ -4692,11 +4700,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; 
GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4837,11 +4845,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: @@ -4851,11 +4859,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4996,11 +5004,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_monotonic_acquire_ret_cmpxchg: @@ -5010,11 +5018,11 @@ ; GFX11-CU-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5155,11 +5163,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: @@ -5169,11 +5177,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5314,11 +5322,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: @@ -5328,11 +5336,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5473,11 +5481,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: @@ -5487,11 +5495,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: 
global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5632,11 +5640,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: @@ -5646,11 +5654,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5791,11 +5799,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_monotonic_seq_cst_ret_cmpxchg: @@ -5805,11 
+5813,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5950,11 +5958,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acquire_seq_cst_ret_cmpxchg: @@ -5964,11 +5972,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6109,11 +6117,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_release_seq_cst_ret_cmpxchg: @@ -6123,11 +6131,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6268,11 +6276,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_acq_rel_seq_cst_ret_cmpxchg: @@ -6282,11 +6290,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6427,11 +6435,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: @@ -6441,11 +6449,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6568,6 +6576,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_unordered_load: @@ -6578,6 +6587,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { 
entry: @@ -6698,6 +6708,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_load: @@ -6708,6 +6719,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -6828,6 +6840,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_load: @@ -6838,6 +6851,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -6958,6 +6972,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_load: @@ -6968,6 +6983,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -7077,10 +7093,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_unordered_store: @@ -7088,10 +7104,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -7200,10 +7216,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_store: @@ -7211,10 +7227,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -7323,10 +7339,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_store: @@ -7334,10 +7350,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -7446,10 +7462,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_store: @@ -7457,10 +7473,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -7569,10 +7585,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; 
GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_atomicrmw: @@ -7580,10 +7596,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -7692,10 +7708,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_atomicrmw: @@ -7703,10 +7719,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -7815,10 
+7831,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_atomicrmw: @@ -7826,10 +7842,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -7938,10 +7954,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_atomicrmw: @@ -7949,10 +7965,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) 
; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -8061,10 +8077,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_atomicrmw: @@ -8072,10 +8088,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -8202,12 +8218,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_ret_atomicrmw: @@ -8215,12 +8231,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 
v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -8348,12 +8364,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_ret_atomicrmw: @@ -8361,12 +8377,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -8494,12 +8510,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_ret_atomicrmw: @@ -8507,12 +8523,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -8631,9 +8647,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: @@ -8643,9 +8659,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8764,9 +8780,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -8776,9 +8792,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8897,9 +8913,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: @@ -8909,9 +8925,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9030,9 +9046,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -9042,9 +9058,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9163,9 +9179,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -9175,9 +9191,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9296,9 +9312,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_acquire_cmpxchg: @@ -9308,9 +9324,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9429,9 +9445,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: @@ -9441,9 +9457,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9562,9 +9578,9 @@ ; GFX11-WGP-NEXT: s_load_b64 
s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: @@ -9574,9 +9590,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9695,9 +9711,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -9707,9 +9723,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9828,9 +9844,9 
@@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -9840,9 +9856,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9961,9 +9977,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_cmpxchg: @@ -9973,9 +9989,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 
%old) { entry: @@ -10094,9 +10110,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_cmpxchg: @@ -10106,9 +10122,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10227,9 +10243,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_seq_cst_cmpxchg: @@ -10239,9 +10255,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: 
s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10360,9 +10376,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_cmpxchg: @@ -10372,9 +10388,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10493,9 +10509,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -10505,9 +10521,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10646,11 +10662,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_ret_cmpxchg: @@ -10660,11 +10676,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10805,11 +10821,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: @@ -10819,11 +10835,11 @@ ; GFX11-CU-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10964,11 +10980,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_monotonic_ret_cmpxchg: @@ -10978,11 +10994,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -11123,11 +11139,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: @@ -11137,11 +11153,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -11282,11 +11298,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: @@ -11296,11 +11312,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: 
s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -11441,11 +11457,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_acquire_ret_cmpxchg: @@ -11455,11 +11471,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -11600,11 +11616,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: 
global_singlethread_one_as_acquire_acquire_ret_cmpxchg: @@ -11614,11 +11630,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -11759,11 +11775,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: @@ -11773,11 +11789,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -11918,11 +11934,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: @@ -11932,11 +11948,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12077,11 +12093,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: @@ -12091,11 +12107,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 
s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12236,11 +12252,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_monotonic_seq_cst_ret_cmpxchg: @@ -12250,11 +12266,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12395,11 +12411,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; 
GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acquire_seq_cst_ret_cmpxchg: @@ -12409,11 +12425,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12554,11 +12570,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_release_seq_cst_ret_cmpxchg: @@ -12568,11 +12584,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12713,11 +12729,11 @@ ; 
GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_acq_rel_seq_cst_ret_cmpxchg: @@ -12727,11 +12743,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12872,11 +12888,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: @@ -12886,11 +12902,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 
-; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -123,6 +123,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_unordered_load: @@ -133,6 +134,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -253,6 +255,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_monotonic_load: @@ -263,6 +266,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -397,6 +401,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_acquire_load: @@ -409,6 +414,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -549,6 +555,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_seq_cst_load: @@ -562,6 +569,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -671,10 +679,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_unordered_store: @@ -682,10 +690,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -794,10 +802,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 
s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_monotonic_store: @@ -805,10 +813,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -932,12 +940,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_release_store: @@ -945,12 +953,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; 
GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -1074,12 +1082,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_seq_cst_store: @@ -1087,12 +1095,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -1201,10 +1209,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_monotonic_atomicrmw: @@ -1212,10 +1220,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1345,9 +1353,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1359,9 +1366,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -1489,12 +1495,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_release_atomicrmw: @@ -1502,12 +1508,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1652,9 +1658,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1668,9 +1673,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1821,9 +1825,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1837,9 +1840,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 
s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1984,14 +1986,14 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_acquire_ret_atomicrmw: @@ -1999,14 +2001,14 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -2161,9 +2163,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 
s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2171,6 +2172,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_acq_rel_ret_atomicrmw: @@ -2178,9 +2180,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2188,6 +2189,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -2342,9 +2344,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2352,6 +2353,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_seq_cst_ret_atomicrmw: 
@@ -2359,9 +2361,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -2369,6 +2370,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -2487,9 +2489,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: @@ -2499,9 +2501,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2641,8 +2643,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -2656,8 +2657,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -2795,11 +2795,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_release_monotonic_cmpxchg: @@ -2809,11 +2809,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2968,8 +2968,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -2985,8 +2984,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3147,8 +3145,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3164,8 +3161,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3311,8 +3307,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, 
v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -3326,8 +3321,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -3471,8 +3465,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -3486,8 +3479,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -3646,8 +3638,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3663,8 +3654,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3825,8 +3815,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3842,8 +3831,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4004,8 +3992,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4021,8 +4008,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4183,8 +4169,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4200,8 +4185,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4346,11 +4330,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_monotonic_monotonic_ret_cmpxchg: @@ -4360,11 +4344,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: 
global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4517,13 +4501,13 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: @@ -4533,13 +4517,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4707,8 +4691,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -4716,6 +4699,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: @@ -4725,8 +4709,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -4734,6 +4717,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4901,8 +4885,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -4910,6 +4893,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: @@ -4919,8 +4903,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -4928,6 +4911,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5080,13 +5064,13 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_monotonic_acquire_ret_cmpxchg: @@ -5096,13 +5080,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5255,13 +5239,13 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: @@ -5271,13 +5255,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5445,8 +5429,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5454,6 +5437,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_release_acquire_ret_cmpxchg: @@ -5463,8 
+5447,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5472,6 +5455,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5639,8 +5623,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5648,6 +5631,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: @@ -5657,8 +5641,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5666,6 +5649,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: 
buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5833,8 +5817,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5842,6 +5825,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: @@ -5851,8 +5835,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -5860,6 +5843,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6027,8 +6011,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6036,6 +6019,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_monotonic_seq_cst_ret_cmpxchg: @@ -6045,8 +6029,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6054,6 +6037,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6221,8 +6205,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6230,6 +6213,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_acquire_seq_cst_ret_cmpxchg: @@ -6239,8 +6223,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6248,6 +6231,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6415,8 +6399,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6424,6 +6407,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_relese_seq_cst_ret_cmpxchg: @@ -6433,8 +6417,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6442,6 +6425,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6609,8 +6593,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6618,6 +6601,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_acq_rel_seq_cst_ret_cmpxchg: @@ -6627,8 +6611,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6636,6 +6619,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6803,8 +6787,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: 
global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6812,6 +6795,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: @@ -6821,8 +6805,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -6830,6 +6813,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6952,6 +6936,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_unordered_load: @@ -6962,6 +6947,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -7082,6 +7068,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_load: @@ -7092,6 +7079,7 @@ ; 
GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -7226,6 +7214,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_acquire_load: @@ -7238,6 +7227,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -7378,6 +7368,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_load: @@ -7391,6 +7382,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -7500,10 +7492,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_unordered_store: @@ -7511,10 +7503,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -7623,10 +7615,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_store: @@ -7634,10 +7626,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -7761,12 +7753,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: 
global_system_one_as_release_store: @@ -7774,12 +7766,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -7903,12 +7895,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_store: @@ -7916,12 +7908,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -8030,10 +8022,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_atomicrmw: @@ -8041,10 +8033,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -8174,9 +8166,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -8188,9 +8179,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -8318,12 +8308,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_release_atomicrmw: @@ -8331,12 +8321,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -8481,9 +8471,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -8497,9 +8486,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -8650,9 +8638,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 
s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -8666,9 +8653,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -8813,14 +8799,14 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_acquire_ret_atomicrmw: @@ -8828,14 +8814,14 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: 
global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -8990,9 +8976,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -9000,6 +8985,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_ret_atomicrmw: @@ -9007,9 +8993,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -9017,6 +9002,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -9171,9 +9157,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -9181,6 +9166,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_ret_atomicrmw: @@ -9188,9 +9174,8 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc @@ -9198,6 +9183,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -9316,9 +9302,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: @@ -9328,9 +9314,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] 
offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9470,8 +9456,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -9485,8 +9470,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -9624,11 +9608,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: @@ -9638,11 +9622,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9797,8 +9781,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -9814,8 +9797,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -9976,8 +9958,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -9993,8 +9974,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10140,8 +10120,7 
@@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10155,8 +10134,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -10300,8 +10278,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10315,8 +10292,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: buffer_gl0_inv @@ -10475,8 +10451,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10492,8 +10467,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10654,8 +10628,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10671,8 +10644,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10833,8 +10805,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10850,8 
+10821,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11012,8 +10982,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11029,8 +10998,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11191,8 +11159,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11208,8 +11175,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11370,8 +11336,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11387,8 +11352,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11549,8 +11513,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11566,8 +11529,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ 
-11728,8 +11690,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11745,8 +11706,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11891,11 +11851,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_monotonic_ret_cmpxchg: @@ -11905,11 +11865,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12062,13 +12022,13 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: @@ -12078,13 +12038,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12240,13 +12200,13 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; 
GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_release_monotonic_ret_cmpxchg: @@ -12256,13 +12216,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12430,8 +12390,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -12439,6 +12398,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: @@ -12448,8 +12408,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, 
v[0:1], s[0:1] offset:16 glc @@ -12457,6 +12416,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12624,8 +12584,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -12633,6 +12592,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: @@ -12642,8 +12602,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -12651,6 +12610,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12803,13 +12763,13 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 
s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_acquire_ret_cmpxchg: @@ -12819,13 +12779,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12978,13 +12938,13 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: @@ -12994,13 +12954,13 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -13168,8 +13128,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13177,6 +13136,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: @@ -13186,8 +13146,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13195,6 +13154,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -13362,8 +13322,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13371,6 +13330,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: @@ -13380,8 +13340,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13389,6 +13348,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -13556,8 +13516,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13565,6 +13524,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, 
s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: @@ -13574,8 +13534,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13583,6 +13542,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -13750,8 +13710,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13759,6 +13718,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_monotonic_seq_cst_ret_cmpxchg: @@ -13768,8 +13728,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 
0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13777,6 +13736,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -13944,8 +13904,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13953,6 +13912,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_acquire_seq_cst_ret_cmpxchg: @@ -13962,8 +13922,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -13971,6 +13930,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -14138,8 +14098,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14147,6 +14106,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_release_seq_cst_ret_cmpxchg: @@ -14156,8 +14116,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14165,6 +14124,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -14332,8 +14292,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14341,6 +14300,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; 
GFX11-CU-LABEL: global_system_one_as_acq_rel_seq_cst_ret_cmpxchg: @@ -14350,8 +14310,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14359,6 +14318,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -14526,8 +14486,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14535,6 +14494,7 @@ ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: buffer_gl1_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: @@ -14544,8 +14504,7 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc @@ -14553,6 
+14512,7 @@ ; GFX11-CU-NEXT: buffer_gl0_inv ; GFX11-CU-NEXT: buffer_gl1_inv ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-volatile.ll @@ -79,6 +79,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_volatile_load_0: @@ -89,6 +90,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -172,23 +174,23 @@ ; GFX11-WGP-LABEL: global_volatile_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_volatile_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) 
; GFX11-CU-NEXT: global_load_b32 v0, v0, s[0:1] glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -270,25 +272,25 @@ ; GFX11-WGP-LABEL: global_volatile_store_0: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_volatile_store_0: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -372,25 +374,25 @@ ; GFX11-WGP-LABEL: global_volatile_store_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] dlc ; GFX11-WGP-NEXT: 
s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_volatile_store_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 -; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -475,6 +477,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_volatile_workgroup_acquire_load: @@ -485,6 +488,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -560,12 +564,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_volatile_workgroup_release_store: @@ -573,11 +577,11 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; 
GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -123,6 +123,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_unordered_load: @@ -133,6 +134,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -253,6 +255,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_monotonic_load: @@ -263,6 +266,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -383,6 +387,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acquire_load: @@ -393,6 +398,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -513,6 +519,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_load: @@ -523,6 +530,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -632,10 +640,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_unordered_store: @@ -643,10 +651,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -755,10 +763,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_monotonic_store: @@ -766,10 +774,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -878,10 +886,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_release_store: @@ -889,10 +897,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, 
s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -1001,10 +1009,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_store: @@ -1012,10 +1020,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -1124,10 +1132,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_monotonic_atomicrmw: @@ -1135,10 +1143,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; 
GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1247,10 +1255,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acquire_atomicrmw: @@ -1258,10 +1266,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1370,10 +1378,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_release_atomicrmw: @@ -1381,10 +1389,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1493,10 +1501,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_atomicrmw: @@ -1504,10 +1512,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1616,10 +1624,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_atomicrmw: @@ -1627,10 +1635,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1757,12 +1765,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acquire_ret_atomicrmw: @@ -1770,12 +1778,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1903,12 +1911,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, 
s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_ret_atomicrmw: @@ -1916,12 +1924,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -2049,12 +2057,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_ret_atomicrmw: @@ -2062,12 +2070,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -2186,9 +2194,9 @@ ; 
GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: @@ -2198,9 +2206,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2319,9 +2327,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: @@ -2331,9 +2339,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ 
-2452,9 +2460,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_release_monotonic_cmpxchg: @@ -2464,9 +2472,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2585,9 +2593,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: @@ -2597,9 +2605,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 
%old) { entry: @@ -2718,9 +2726,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: @@ -2730,9 +2738,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2851,9 +2859,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_monotonic_acquire_cmpxchg: @@ -2863,9 +2871,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* 
%out, i32 %in, i32 %old) { entry: @@ -2984,9 +2992,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: @@ -2996,9 +3004,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3117,9 +3125,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_release_acquire_cmpxchg: @@ -3129,9 +3137,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 
addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3250,9 +3258,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: @@ -3262,9 +3270,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3383,9 +3391,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: @@ -3395,9 +3403,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: 
s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3516,9 +3524,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_monotonic_seq_cst_cmpxchg: @@ -3528,9 +3536,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3649,9 +3657,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acquire_seq_cst_cmpxchg: @@ -3661,9 +3669,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3782,9 +3790,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_release_seq_cst_cmpxchg: @@ -3794,9 +3802,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3915,9 +3923,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_seq_cst_cmpxchg: @@ -3927,9 +3935,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4048,9 +4056,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: @@ -4060,9 +4068,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4201,11 +4209,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_monotonic_monotonic_ret_cmpxchg: @@ -4215,11 +4223,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4360,11 +4368,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: @@ -4374,11 +4382,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4519,11 +4527,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; 
GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_release_monotonic_ret_cmpxchg: @@ -4533,11 +4541,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4678,11 +4686,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: @@ -4692,11 +4700,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4837,11 +4845,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], 
s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: @@ -4851,11 +4859,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4996,11 +5004,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_monotonic_acquire_ret_cmpxchg: @@ -5010,11 +5018,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5155,11 +5163,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: @@ -5169,11 +5177,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5314,11 +5322,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, 
v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: @@ -5328,11 +5336,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5473,11 +5481,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: @@ -5487,11 +5495,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5632,11 +5640,11 @@ ; GFX11-WGP-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: @@ -5646,11 +5654,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5791,11 +5799,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_monotonic_seq_cst_ret_cmpxchg: @@ -5805,11 +5813,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5950,11 +5958,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acquire_seq_cst_ret_cmpxchg: @@ -5964,11 +5972,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6109,11 +6117,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: 
global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_release_seq_cst_ret_cmpxchg: @@ -6123,11 +6131,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6268,11 +6276,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_acq_rel_seq_cst_ret_cmpxchg: @@ -6282,11 +6290,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6427,11 +6435,11 @@ ; 
GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: @@ -6441,11 +6449,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6568,6 +6576,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_unordered_load: @@ -6578,6 +6587,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -6698,6 +6708,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: 
s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_load: @@ -6708,6 +6719,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -6828,6 +6840,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_load: @@ -6838,6 +6851,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -6958,6 +6972,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_load: @@ -6968,6 +6983,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -7077,10 +7093,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: 
s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_unordered_store: @@ -7088,10 +7104,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -7200,10 +7216,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_store: @@ -7211,10 +7227,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -7323,10 +7339,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_store: @@ -7334,10 +7350,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -7446,10 +7462,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_store: @@ -7457,10 +7473,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -7569,10 +7585,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: 
global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_atomicrmw: @@ -7580,10 +7596,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -7692,10 +7708,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_atomicrmw: @@ -7703,10 +7719,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -7815,10 +7831,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; 
GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_atomicrmw: @@ -7826,10 +7842,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -7938,10 +7954,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_atomicrmw: @@ -7949,10 +7965,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -8061,10 +8077,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_atomicrmw: @@ -8072,10 +8088,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -8202,12 +8218,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_ret_atomicrmw: @@ -8215,12 +8231,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -8348,12 +8364,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_ret_atomicrmw: @@ -8361,12 +8377,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -8494,12 +8510,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_ret_atomicrmw: @@ -8507,12 +8523,12 @@ ; GFX11-CU-NEXT: 
s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -8631,9 +8647,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: @@ -8643,9 +8659,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8764,9 +8780,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -8776,9 +8792,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -8897,9 +8913,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: @@ -8909,9 +8925,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9030,9 +9046,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -9042,9 +9058,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9163,9 +9179,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -9175,9 +9191,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9296,9 +9312,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 
+; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_acquire_cmpxchg: @@ -9308,9 +9324,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9429,9 +9445,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: @@ -9441,9 +9457,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9562,9 +9578,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 
v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: @@ -9574,9 +9590,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9695,9 +9711,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -9707,9 +9723,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9828,9 +9844,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: 
global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -9840,9 +9856,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9961,9 +9977,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_cmpxchg: @@ -9973,9 +9989,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10094,9 +10110,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_cmpxchg: @@ -10106,9 +10122,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10227,9 +10243,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_seq_cst_cmpxchg: @@ -10239,9 +10255,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10360,9 +10376,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_cmpxchg: @@ -10372,9 +10388,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10493,9 +10509,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -10505,9 +10521,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10646,11 +10662,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_ret_cmpxchg: @@ -10660,11 +10676,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10805,11 +10821,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: @@ -10819,11 +10835,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], 
s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10964,11 +10980,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_monotonic_ret_cmpxchg: @@ -10978,11 +10994,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -11123,11 +11139,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: 
s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: @@ -11137,11 +11153,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -11282,11 +11298,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: @@ -11296,11 +11312,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -11441,11 +11457,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 
0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_acquire_ret_cmpxchg: @@ -11455,11 +11471,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -11600,11 +11616,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: @@ -11614,11 +11630,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: 
v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -11759,11 +11775,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: @@ -11773,11 +11789,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -11918,11 +11934,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] 
+; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: @@ -11932,11 +11948,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12077,11 +12093,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: @@ -12091,11 +12107,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12236,11 +12252,11 @@ ; 
GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_monotonic_seq_cst_ret_cmpxchg: @@ -12250,11 +12266,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12395,11 +12411,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acquire_seq_cst_ret_cmpxchg: @@ -12409,11 +12425,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12554,11 +12570,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_release_seq_cst_ret_cmpxchg: @@ -12568,11 +12584,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12713,11 +12729,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_acq_rel_seq_cst_ret_cmpxchg: @@ -12727,11 +12743,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12872,11 +12888,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: @@ -12886,11 +12902,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 
addrspace(1)* %out, i32 %in, i32 %old) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -123,6 +123,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_unordered_load: @@ -133,6 +134,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -253,6 +255,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_monotonic_load: @@ -263,6 +266,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -387,6 +391,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_acquire_load: @@ -397,6 +402,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm 
i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -526,6 +532,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_load: @@ -536,6 +543,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -645,10 +653,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_unordered_store: @@ -656,10 +664,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -768,10 +776,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 
v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_monotonic_store: @@ -779,10 +787,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -901,12 +909,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_release_store: @@ -914,11 +922,11 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -1037,12 +1045,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_store: @@ -1050,11 +1058,11 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -1163,10 +1171,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_monotonic_atomicrmw: @@ -1174,10 +1182,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1292,9 +1300,8 @@ ; 
GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1305,10 +1312,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1427,12 +1434,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_release_atomicrmw: @@ -1440,11 +1447,11 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1569,9 +1576,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1584,11 +1590,11 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1713,9 +1719,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -1728,11 +1733,11 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: 
global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -1862,13 +1867,13 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_acquire_ret_atomicrmw: @@ -1876,12 +1881,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -2022,15 +2027,15 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: 
global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_ret_atomicrmw: @@ -2038,13 +2043,13 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -2185,15 +2190,15 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_ret_atomicrmw: @@ -2201,13 +2206,13 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; 
GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -2326,9 +2331,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: @@ -2338,9 +2343,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2465,8 +2470,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -2479,9 +2483,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, 
v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2610,11 +2614,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: @@ -2624,10 +2628,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2762,8 +2766,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -2778,10 +2781,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -2916,8 +2919,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -2932,10 +2934,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3060,8 +3062,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -3074,9 +3075,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 
v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3201,8 +3202,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -3215,9 +3215,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3352,8 +3352,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3368,10 +3367,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: 
global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3506,8 +3505,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3522,10 +3520,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3660,8 +3658,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3676,10 +3673,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: 
global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3814,8 +3811,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3830,10 +3826,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -3968,8 +3964,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -3984,10 +3979,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: 
global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4122,8 +4117,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4138,10 +4132,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4276,8 +4270,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4292,10 +4285,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: 
global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4430,8 +4423,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -4446,10 +4438,10 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4588,11 +4580,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_monotonic_monotonic_ret_cmpxchg: @@ -4602,11 +4594,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4750,12 +4742,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: @@ -4765,11 +4757,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -4920,13 +4912,13 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: 
s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_release_monotonic_ret_cmpxchg: @@ -4936,12 +4928,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5095,14 +5087,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: @@ -5112,12 +5104,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 
s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5271,14 +5263,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: @@ -5288,12 +5280,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5437,12 +5429,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 
s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_monotonic_acquire_ret_cmpxchg: @@ -5452,11 +5444,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5600,12 +5592,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: @@ -5615,11 +5607,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; 
GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5773,14 +5765,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: @@ -5790,12 +5782,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -5949,14 +5941,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] 
offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: @@ -5966,12 +5958,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6125,14 +6117,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: @@ -6142,12 +6134,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: 
global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6301,14 +6293,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_monotonic_seq_cst_ret_cmpxchg: @@ -6318,12 +6310,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6477,14 +6469,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_acquire_seq_cst_ret_cmpxchg: @@ -6494,12 +6486,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6653,14 +6645,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_release_seq_cst_ret_cmpxchg: @@ -6670,12 +6662,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -6829,14 +6821,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_acq_rel_seq_cst_ret_cmpxchg: @@ -6846,12 +6838,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7005,14 +6997,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: @@ -7022,12 +7014,12 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -7150,6 +7142,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_unordered_load: @@ -7160,6 +7153,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -7280,6 +7274,7 @@ ; GFX11-WGP-NEXT: global_load_b32 v1, v0, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: 
global_workgroup_one_as_monotonic_load: @@ -7290,6 +7285,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -7414,6 +7410,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_load: @@ -7424,6 +7421,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -7550,6 +7548,7 @@ ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_load: @@ -7560,6 +7559,7 @@ ; GFX11-CU-NEXT: global_load_b32 v1, v0, s[0:1] ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[2:3] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(1)* %out) { entry: @@ -7669,10 +7669,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: 
global_workgroup_one_as_unordered_store: @@ -7680,10 +7680,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -7792,10 +7792,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_store: @@ -7803,10 +7803,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -7919,12 +7919,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: 
global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_store: @@ -7932,10 +7932,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -8048,12 +8048,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_store: @@ -8061,10 +8061,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(1)* %out) { entry: @@ -8173,10 +8173,10 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_atomicrmw: @@ -8184,10 +8184,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -8302,9 +8302,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -8315,10 +8314,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -8431,12 +8430,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_atomicrmw: @@ -8444,10 +8443,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -8566,9 +8565,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -8581,10 +8579,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -8703,9 +8701,8 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 
s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] @@ -8718,10 +8715,10 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -8851,13 +8848,13 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_ret_atomicrmw: @@ -8865,12 +8862,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: 
s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -9005,15 +9002,15 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_ret_atomicrmw: @@ -9021,12 +9018,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -9161,15 +9158,15 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; 
GFX11-WGP-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_ret_atomicrmw: @@ -9177,12 +9174,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x8 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, s2 ; GFX11-CU-NEXT: global_atomic_swap_b32 v1, v0, v1, s[0:1] glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v0, v1, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in) { entry: @@ -9301,9 +9298,9 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: @@ -9313,9 +9310,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9440,8 +9437,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 
-; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -9454,9 +9450,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9579,11 +9575,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: @@ -9593,9 +9589,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9724,8 +9720,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -9740,9 +9735,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -9871,8 +9866,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -9887,9 +9881,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10014,8 +10008,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 
s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10028,9 +10021,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10155,8 +10148,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -10169,9 +10161,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10300,8 +10292,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; 
GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10316,9 +10307,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10447,8 +10438,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10463,9 +10453,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10594,8 +10584,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10610,9 +10599,9 @@ ; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10741,8 +10730,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10757,9 +10745,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -10888,8 +10876,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -10904,9 +10891,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -11035,8 +11022,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11051,9 +11037,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -11182,8 +11168,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11198,9 +11183,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: 
v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -11329,8 +11314,7 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 @@ -11345,9 +11329,9 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v2, v[0:1], s[0:1] offset:16 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -11486,11 +11470,11 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_ret_cmpxchg: @@ -11500,11 +11484,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: 
v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -11648,12 +11632,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: @@ -11663,11 +11647,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -11812,13 +11796,13 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt 
vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_monotonic_ret_cmpxchg: @@ -11828,11 +11812,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -11980,14 +11964,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: @@ -11997,11 +11981,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 
; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12149,14 +12133,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: @@ -12166,11 +12150,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12314,12 +12298,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] 
offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_acquire_ret_cmpxchg: @@ -12329,11 +12313,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12477,12 +12461,12 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: @@ -12492,11 +12476,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; 
GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12644,14 +12628,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: @@ -12661,11 +12645,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12813,14 +12797,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; 
GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: @@ -12830,11 +12814,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -12982,14 +12966,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: @@ -12999,11 +12983,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; 
GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -13151,14 +13135,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_monotonic_seq_cst_ret_cmpxchg: @@ -13168,11 +13152,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -13320,14 +13304,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; 
GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_acquire_seq_cst_ret_cmpxchg: @@ -13337,11 +13321,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -13489,14 +13473,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_release_seq_cst_ret_cmpxchg: @@ -13506,11 +13490,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; 
GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -13658,14 +13642,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_acq_rel_seq_cst_ret_cmpxchg: @@ -13675,11 +13659,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: @@ -13827,14 +13811,14 @@ ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv ; 
GFX11-WGP-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: @@ -13844,11 +13828,11 @@ ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 ; GFX11-CU-NEXT: global_atomic_cmpswap_b32 v0, v2, v[0:1], s[0:1] offset:16 glc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v2, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %out, i32 %in, i32 %old) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-agent.ll @@ -119,8 +119,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -130,8 +129,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -251,8 +249,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -262,8 +259,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -386,8 +382,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -398,8 +393,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -532,8 +526,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 @@ -546,8 +539,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 
v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -650,8 +642,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -659,8 +650,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -759,8 +749,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -768,8 +757,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -878,8 +866,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt 
vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -889,8 +876,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm @@ -1000,8 +986,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -1011,8 +996,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm @@ -1112,8 +1096,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1121,8 +1104,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: 
s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1231,8 +1213,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1242,8 +1223,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -1353,8 +1333,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 @@ -1364,8 +1343,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm @@ -1485,8 +1463,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 @@ -1498,8 +1475,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1620,8 +1596,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 @@ -1633,8 +1608,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1756,8 +1730,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1768,8 +1741,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -1902,8 +1874,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 @@ -1916,8 +1887,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2051,8 +2021,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 @@ -2065,8 +2034,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2178,8 +2146,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2188,8 +2155,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2309,8 +2275,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2321,8 +2286,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2443,8 +2407,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2455,8 +2418,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 
s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -2587,8 +2549,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2601,8 +2562,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -2734,8 +2694,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2748,8 +2707,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ 
-2871,8 +2829,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2883,8 +2840,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3005,8 +2961,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3017,8 +2972,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3149,8 +3103,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3163,8 +3116,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3296,8 +3248,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3310,8 +3261,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3443,8 +3393,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3457,8 +3406,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3590,8 +3538,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3604,8 +3551,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3737,8 +3683,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3751,8 +3696,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3884,8 +3828,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3898,8 +3841,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -4031,8 +3973,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4045,8 +3986,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -4178,8 +4118,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4192,8 +4131,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -4323,8 +4261,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4335,8 +4272,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4471,8 +4407,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4484,8 +4419,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 
offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4627,8 +4561,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4641,8 +4574,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -4788,8 +4720,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4803,8 +4734,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -4950,8 +4880,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4965,8 +4894,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5102,8 +5030,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5115,8 +5042,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5251,8 +5177,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5264,8 +5189,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5410,8 +5334,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5425,8 +5348,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5572,8 +5494,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5587,8 +5508,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5734,8 +5654,7 @@ ; GFX11-WGP: ; %bb.0: 
; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5749,8 +5668,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5896,8 +5814,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5911,8 +5828,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6058,8 +5974,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6073,8 +5988,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6220,8 +6134,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6235,8 +6148,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6382,8 +6294,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6397,8 +6308,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6544,8 +6454,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6559,8 +6468,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6684,8 +6592,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6695,8 +6602,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6816,8 +6722,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6827,8 +6732,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6948,8 +6852,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6959,8 +6862,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -7080,8 +6982,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -7091,8 +6992,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, 
s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -7194,8 +7094,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7203,8 +7102,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7303,8 +7201,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7312,8 +7209,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7412,8 +7308,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; 
@@ -7421,8 +7316,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7521,8 +7415,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7530,8 +7423,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7630,8 +7522,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7639,8 +7530,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7739,8 +7629,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7748,8 +7637,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7848,8 +7736,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7857,8 +7744,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7957,8 +7843,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7966,8 +7851,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 
v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -8066,8 +7950,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -8075,8 +7958,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -8193,8 +8075,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8204,8 +8085,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8325,8 +8205,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 
v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8336,8 +8215,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8457,8 +8335,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8468,8 +8345,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8580,8 +8456,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8590,8 +8465,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8701,8 +8575,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8711,8 +8584,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8822,8 +8694,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8832,8 +8703,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8943,8 +8813,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8953,8 +8822,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9064,8 +8932,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9074,8 +8941,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9185,8 +9051,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9195,8 +9060,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 
v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9306,8 +9170,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9316,8 +9179,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9427,8 +9289,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9437,8 +9298,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9548,8 +9408,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9558,8 +9417,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9669,8 +9527,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9679,8 +9536,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9790,8 +9646,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9800,8 +9655,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9911,8 +9765,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9921,8 +9774,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10032,8 +9884,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -10042,8 +9893,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10153,8 +10003,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -10163,8 +10012,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10274,8 +10122,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -10284,8 +10131,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10413,8 +10259,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10425,8 +10270,7 @@ ; GFX11-CU: 
; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10558,8 +10402,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10570,8 +10413,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10703,8 +10545,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10715,8 +10556,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; 
GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10848,8 +10688,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10860,8 +10699,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10993,8 +10831,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11005,8 +10842,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11138,8 +10974,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11150,8 +10985,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11283,8 +11117,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11295,8 +11128,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11428,8 +11260,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11440,8 +11271,7 @@ ; GFX11-CU: ; %bb.0: ; 
%entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11573,8 +11403,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11585,8 +11414,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11718,8 +11546,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11730,8 +11557,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: 
ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11863,8 +11689,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11875,8 +11700,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12008,8 +11832,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12020,8 +11843,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12153,8 +11975,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12165,8 +11986,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12298,8 +12118,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12310,8 +12129,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12443,8 +12261,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12455,8 +12272,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; 
GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-nontemporal.ll @@ -133,12 +133,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_nontemporal_load_0: @@ -146,12 +146,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(1)* %out) { entry: @@ -291,6 +291,7 @@ ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_nontemporal_load_1: @@ -304,6 +305,7 @@ ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(1)* %out) { entry: @@ -435,9 +437,8 @@ ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -448,9 +449,8 @@ ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(3)* %out) { diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-singlethread.ll @@ -119,8 +119,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -130,8 +129,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 
; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -251,8 +249,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -262,8 +259,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -383,8 +379,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -394,8 +389,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -515,8 +509,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -526,8 +519,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -629,8 +621,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -638,8 +629,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -738,8 +728,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -747,8 +736,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: 
v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -847,8 +835,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -856,8 +843,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -956,8 +942,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -965,8 +950,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -1065,8 +1049,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1074,8 +1057,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1174,8 +1156,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1183,8 +1164,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1283,8 +1263,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1292,8 +1271,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1392,8 +1370,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1401,8 +1378,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1501,8 +1477,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1510,8 +1485,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1628,8 +1602,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -1639,8 +1612,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -1760,8 +1732,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -1771,8 +1742,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -1892,8 +1862,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -1903,8 +1872,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -2015,8 +1983,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2025,8 +1992,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2136,8 +2102,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2146,8 +2111,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2257,8 +2221,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2267,8 +2230,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2378,8 +2340,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2388,8 +2349,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2499,8 +2459,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2509,8 +2468,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2620,8 +2578,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2630,8 +2587,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2741,8 +2697,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2751,8 +2706,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2862,8 +2816,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2872,8 +2825,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2983,8 +2935,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2993,8 +2944,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3104,8 +3054,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3114,8 +3063,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3225,8 +3173,7 @@ ; GFX11-WGP: ; %bb.0: ; 
%entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3235,8 +3182,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3346,8 +3292,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3356,8 +3301,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3467,8 +3411,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3477,8 +3420,7 
@@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3588,8 +3530,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3598,8 +3539,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3709,8 +3649,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3719,8 +3658,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm 
@@ -3848,8 +3786,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3860,8 +3797,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3993,8 +3929,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4005,8 +3940,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4138,8 +4072,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4150,8 +4083,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4283,8 +4215,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4295,8 +4226,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4428,8 +4358,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4440,8 +4369,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4573,8 +4501,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4585,8 +4512,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4718,8 +4644,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4730,8 +4655,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4863,8 +4787,7 @@ ; GFX11-WGP: ; 
%bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4875,8 +4798,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5008,8 +4930,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5020,8 +4941,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5153,8 +5073,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; 
GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5165,8 +5084,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5298,8 +5216,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5310,8 +5227,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5443,8 +5359,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5455,8 +5370,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5588,8 +5502,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5600,8 +5513,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5733,8 +5645,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5745,8 +5656,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5878,8 +5788,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5890,8 +5799,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6014,8 +5922,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6025,8 +5932,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6146,8 +6052,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6157,8 +6062,7 @@ ; 
GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6278,8 +6182,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6289,8 +6192,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6410,8 +6312,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6421,8 +6322,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6524,8 +6424,7 @@ ; GFX11-WGP: ; %bb.0: ; 
%entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -6533,8 +6432,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -6633,8 +6531,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -6642,8 +6539,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -6742,8 +6638,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -6751,8 +6646,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; 
GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -6851,8 +6745,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -6860,8 +6753,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -6960,8 +6852,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -6969,8 +6860,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7069,8 +6959,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; 
GFX11-WGP-NEXT: s_endpgm ; @@ -7078,8 +6967,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7178,8 +7066,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7187,8 +7074,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7287,8 +7173,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7296,8 +7181,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7396,8 +7280,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; 
GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7405,8 +7288,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7523,8 +7405,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7534,8 +7415,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7655,8 +7535,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7666,8 +7545,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7787,8 +7665,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7798,8 +7675,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7910,8 +7786,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -7920,8 +7795,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8031,8 +7905,7 
@@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8041,8 +7914,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8152,8 +8024,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8162,8 +8033,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8273,8 +8143,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: 
s_endpgm @@ -8283,8 +8152,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8394,8 +8262,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8404,8 +8271,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8515,8 +8381,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8525,8 +8390,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 
offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8636,8 +8500,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8646,8 +8509,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8757,8 +8619,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8767,8 +8628,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8878,8 +8738,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: 
ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8888,8 +8747,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8999,8 +8857,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9009,8 +8866,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9120,8 +8976,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9130,8 +8985,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, 
s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9241,8 +9095,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9251,8 +9104,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9362,8 +9214,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9372,8 +9223,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9483,8 +9333,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; 
GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9493,8 +9342,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9604,8 +9452,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9614,8 +9461,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9743,8 +9589,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9755,8 +9600,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9888,8 +9732,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9900,8 +9743,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10033,8 +9875,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10045,8 +9886,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10178,8 +10018,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10190,8 +10029,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10323,8 +10161,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10335,8 +10172,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10468,8 +10304,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10480,8 +10315,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10613,8 +10447,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10625,8 +10458,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10758,8 +10590,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10770,8 +10601,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 
s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10903,8 +10733,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10915,8 +10744,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11048,8 +10876,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11060,8 +10887,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11193,8 +11019,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11205,8 +11030,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11338,8 +11162,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11350,8 +11173,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11483,8 +11305,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) @@ -11495,8 +11316,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11628,8 +11448,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11640,8 +11459,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11773,8 +11591,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11785,8 +11602,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-system.ll @@ -119,8 +119,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -130,8 +129,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -251,8 +249,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -262,8 +259,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ 
-386,8 +382,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -398,8 +393,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -532,8 +526,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 @@ -546,8 +539,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -650,8 +642,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -659,8 +650,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; 
GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -759,8 +749,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -768,8 +757,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -878,8 +866,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -889,8 +876,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm @@ -1000,8 +986,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -1011,8 +996,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm @@ -1112,8 +1096,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1121,8 +1104,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1231,8 +1213,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1242,8 +1223,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -1353,8 +1333,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 @@ -1364,8 +1343,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm @@ -1485,8 +1463,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 @@ -1498,8 +1475,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1620,8 +1596,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 @@ -1633,8 +1608,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1756,8 +1730,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1768,8 +1741,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -1902,8 +1874,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 @@ -1916,8 +1887,7 @@ ; GFX11-CU: ; 
%bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2051,8 +2021,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 @@ -2065,8 +2034,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2178,8 +2146,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2188,8 +2155,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; 
GFX11-CU-NEXT: s_endpgm @@ -2309,8 +2275,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2321,8 +2286,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2443,8 +2407,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2455,8 +2418,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -2587,8 +2549,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: 
v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2601,8 +2562,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -2734,8 +2694,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2748,8 +2707,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -2871,8 +2829,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2883,8 +2840,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3005,8 +2961,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3017,8 +2972,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3149,8 +3103,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3163,8 +3116,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3296,8 +3248,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) 
-; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3310,8 +3261,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3443,8 +3393,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3457,8 +3406,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3590,8 +3538,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3604,8 +3551,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; 
GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3737,8 +3683,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3751,8 +3696,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3884,8 +3828,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3898,8 +3841,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 
v0, v1, v2 offset:16 @@ -4031,8 +3973,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4045,8 +3986,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -4178,8 +4118,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4192,8 +4131,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -4323,8 +4261,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: 
v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4335,8 +4272,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4471,8 +4407,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4484,8 +4419,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4627,8 +4561,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4641,8 +4574,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -4788,8 +4720,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4803,8 +4734,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -4950,8 +4880,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4965,8 +4894,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5102,8 +5030,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 
s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5115,8 +5042,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5251,8 +5177,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5264,8 +5189,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5410,8 +5334,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5425,8 +5348,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5572,8 +5494,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5587,8 +5508,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5734,8 +5654,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5749,8 +5668,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 
s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5896,8 +5814,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5911,8 +5828,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6058,8 +5974,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6073,8 +5988,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6220,8 +6134,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, 
s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6235,8 +6148,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6382,8 +6294,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6397,8 +6308,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6544,8 +6454,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6559,8 +6468,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 
s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6684,8 +6592,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6695,8 +6602,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6816,8 +6722,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6827,8 +6732,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6948,8 +6852,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: 
s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6959,8 +6862,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -7080,8 +6982,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -7091,8 +6992,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -7194,8 +7094,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7203,8 +7102,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7303,8 +7201,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7312,8 +7209,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7412,8 +7308,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7421,8 +7316,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7521,8 +7415,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 
v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7530,8 +7423,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7630,8 +7522,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7639,8 +7530,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7739,8 +7629,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7748,8 +7637,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7848,8 +7736,7 @@ ; 
GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7857,8 +7744,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7957,8 +7843,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7966,8 +7851,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -8066,8 +7950,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -8075,8 +7958,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -8193,8 +8075,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8204,8 +8085,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8325,8 +8205,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8336,8 +8215,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8457,8 +8335,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8468,8 +8345,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8580,8 +8456,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8590,8 +8465,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8701,8 +8575,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8711,8 +8584,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; 
GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8822,8 +8694,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8832,8 +8703,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8943,8 +8813,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8953,8 +8822,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9064,8 +8932,7 @@ ; GFX11-WGP: 
; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9074,8 +8941,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9185,8 +9051,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9195,8 +9060,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9306,8 +9170,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ 
-9316,8 +9179,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9427,8 +9289,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9437,8 +9298,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9548,8 +9408,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9558,8 +9417,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; 
GFX11-CU-NEXT: s_endpgm @@ -9669,8 +9527,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9679,8 +9536,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9790,8 +9646,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9800,8 +9655,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9911,8 +9765,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: 
ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9921,8 +9774,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10032,8 +9884,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -10042,8 +9893,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10153,8 +10003,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -10163,8 +10012,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: 
v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10274,8 +10122,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -10284,8 +10131,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10413,8 +10259,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10425,8 +10270,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10558,8 +10402,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10570,8 +10413,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10703,8 +10545,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10715,8 +10556,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10848,8 +10688,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10860,8 +10699,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 
s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10993,8 +10831,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11005,8 +10842,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11138,8 +10974,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11150,8 +10985,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 
offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11283,8 +11117,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11295,8 +11128,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11428,8 +11260,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11440,8 +11271,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11573,8 +11403,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11585,8 +11414,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11718,8 +11546,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11730,8 +11557,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11863,8 +11689,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11875,8 +11700,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12008,8 +11832,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12020,8 +11843,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12153,8 +11975,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12165,8 +11986,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12298,8 +12118,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12310,8 +12129,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12443,8 +12261,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12455,8 +12272,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll +++ 
b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-volatile.ll @@ -81,12 +81,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_volatile_load_0: @@ -94,12 +94,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s2 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, s2 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(1)* %out) { entry: @@ -191,6 +191,7 @@ ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: local_volatile_load_1: @@ -204,6 +205,7 @@ ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %in, i32 addrspace(1)* %out) { entry: @@ -287,9 +289,8 @@ ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -300,9 +301,8 @@ ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(3)* %out) { @@ -480,8 +480,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -492,8 +491,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -565,8 +563,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -576,8 +573,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; 
GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-wavefront.ll @@ -119,8 +119,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -130,8 +129,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -251,8 +249,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -262,8 +259,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -383,8 +379,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -394,8 +389,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -515,8 +509,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -526,8 +519,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -629,8 +621,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -638,8 
+629,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -738,8 +728,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -747,8 +736,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -847,8 +835,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -856,8 +843,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -956,8 +942,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 
v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -965,8 +950,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -1065,8 +1049,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1074,8 +1057,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1174,8 +1156,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1183,8 +1164,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 
v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1283,8 +1263,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1292,8 +1271,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1392,8 +1370,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1401,8 +1378,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1501,8 +1477,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1510,8 +1485,7 @@ ; GFX11-CU: ; %bb.0: ; 
%entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1628,8 +1602,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -1639,8 +1612,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -1760,8 +1732,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -1771,8 +1742,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -1892,8 
+1862,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -1903,8 +1872,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -2015,8 +1983,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2025,8 +1992,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2136,8 +2102,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 
; GFX11-WGP-NEXT: s_endpgm @@ -2146,8 +2111,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2257,8 +2221,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2267,8 +2230,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2378,8 +2340,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2388,8 +2349,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 
v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2499,8 +2459,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2509,8 +2468,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2620,8 +2578,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2630,8 +2587,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2741,8 +2697,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; 
GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2751,8 +2706,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2862,8 +2816,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2872,8 +2825,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2983,8 +2935,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2993,8 +2944,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: 
v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3104,8 +3054,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3114,8 +3063,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3225,8 +3173,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3235,8 +3182,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3346,8 +3292,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 
v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3356,8 +3301,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3467,8 +3411,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3477,8 +3420,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3588,8 +3530,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3598,8 +3539,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 
s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3709,8 +3649,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -3719,8 +3658,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -3848,8 +3786,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3860,8 +3797,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3993,8 +3929,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4005,8 +3940,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4138,8 +4072,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4150,8 +4083,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4283,8 +4215,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4295,8 +4226,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; 
GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4428,8 +4358,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4440,8 +4369,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4573,8 +4501,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4585,8 +4512,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: 
ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4718,8 +4644,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4730,8 +4655,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4863,8 +4787,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4875,8 +4798,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5008,8 +4930,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 
s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5020,8 +4941,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5153,8 +5073,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5165,8 +5084,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5298,8 +5216,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5310,8 +5227,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], 
s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5443,8 +5359,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5455,8 +5370,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5588,8 +5502,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5600,8 +5513,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5733,8 +5645,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5745,8 +5656,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5878,8 +5788,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5890,8 +5799,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -6014,8 +5922,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6025,8 +5932,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6146,8 +6052,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6157,8 +6062,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6278,8 +6182,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6289,8 +6192,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, 
s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6410,8 +6312,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6421,8 +6322,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6524,8 +6424,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -6533,8 +6432,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -6633,8 +6531,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ 
-6642,8 +6539,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -6742,8 +6638,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -6751,8 +6646,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -6851,8 +6745,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -6860,8 +6753,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -6960,8 +6852,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -6969,8 +6860,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7069,8 +6959,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7078,8 +6967,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7178,8 +7066,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7187,8 +7074,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7287,8 +7173,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7296,8 +7181,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7396,8 +7280,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7405,8 +7288,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7523,8 +7405,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7534,8 +7415,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7655,8 +7535,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7666,8 +7545,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7787,8 +7665,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -7798,8 +7675,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; 
GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -7910,8 +7786,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -7920,8 +7795,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8031,8 +7905,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8041,8 +7914,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8152,8 +8024,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, 
s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8162,8 +8033,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8273,8 +8143,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8283,8 +8152,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8394,8 +8262,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8404,8 +8271,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8515,8 +8381,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8525,8 +8390,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8636,8 +8500,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8646,8 +8509,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8757,8 +8619,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8767,8 +8628,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8878,8 +8738,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8888,8 +8747,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8999,8 +8857,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9009,8 +8866,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 
v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9120,8 +8976,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9130,8 +8985,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9241,8 +9095,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9251,8 +9104,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9362,8 +9214,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9372,8 +9223,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9483,8 +9333,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9493,8 +9342,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9604,8 +9452,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9614,8 +9461,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9743,8 +9589,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9755,8 +9600,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -9888,8 +9732,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -9900,8 +9743,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10033,8 +9875,7 
@@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10045,8 +9886,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10178,8 +10018,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10190,8 +10029,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10323,8 +10161,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: 
v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10335,8 +10172,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10468,8 +10304,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10480,8 +10315,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10613,8 +10447,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10625,8 +10458,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10758,8 +10590,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10770,8 +10601,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10903,8 +10733,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10915,8 +10744,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11048,8 +10876,7 @@ ; 
GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11060,8 +10887,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11193,8 +11019,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11205,8 +11030,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11338,8 +11162,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: 
v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11350,8 +11173,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11483,8 +11305,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11495,8 +11316,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11628,8 +11448,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11640,8 +11459,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11773,8 +11591,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11785,8 +11602,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-local-workgroup.ll @@ -119,8 +119,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -130,8 +129,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 
+; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -251,8 +249,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -262,8 +259,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -386,8 +382,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -398,8 +393,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -532,8 +526,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 @@ -546,8 +539,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -650,8 +642,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -659,8 +650,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -759,8 +749,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -768,8 +757,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, 
i32 addrspace(3)* %out) { @@ -878,8 +866,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -889,8 +876,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm @@ -1000,8 +986,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -1011,8 +996,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm @@ -1112,8 +1096,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -1121,8 
+1104,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -1231,8 +1213,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1242,8 +1223,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_endpgm @@ -1353,8 +1333,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 @@ -1364,8 +1343,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: 
s_endpgm @@ -1485,8 +1463,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 @@ -1498,8 +1475,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1620,8 +1596,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 @@ -1633,8 +1608,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -1756,8 +1730,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: 
ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: buffer_gl0_inv @@ -1768,8 +1741,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -1902,8 +1874,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 @@ -1916,8 +1887,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2051,8 +2021,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 @@ -2065,8 +2034,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; 
GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2178,8 +2146,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -2188,8 +2155,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -2309,8 +2275,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2321,8 +2286,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -2443,8 +2407,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2455,8 +2418,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -2587,8 +2549,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2601,8 +2562,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -2734,8 +2694,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -2748,8 +2707,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: 
s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -2871,8 +2829,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -2883,8 +2840,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -3005,8 +2961,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -3017,8 +2972,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) @@ -3149,8 +3103,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3163,8 +3116,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3296,8 +3248,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3310,8 +3261,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3443,8 +3393,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 
; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3457,8 +3406,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3590,8 +3538,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3604,8 +3551,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3737,8 +3683,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3751,8 +3696,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -3884,8 +3828,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -3898,8 +3841,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -4031,8 +3973,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4045,8 +3986,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -4178,8 +4118,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: 
v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4192,8 +4131,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 @@ -4323,8 +4261,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4335,8 +4272,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4471,8 +4407,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -4484,8 +4419,7 @@ ; GFX11-CU: ; %bb.0: ; %entry 
; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -4627,8 +4561,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4641,8 +4574,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -4788,8 +4720,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4803,8 +4734,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -4950,8 +4880,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -4965,8 +4894,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5102,8 +5030,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5115,8 +5042,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5251,8 +5177,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -5264,8 +5189,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -5410,8 +5334,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5425,8 +5348,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5572,8 +5494,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5587,8 +5508,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5734,8 +5654,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5749,8 +5668,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -5896,8 +5814,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5911,8 +5828,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6058,8 +5974,7 @@ ; 
GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6073,8 +5988,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6220,8 +6134,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6235,8 +6148,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6382,8 +6294,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6397,8 +6308,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6544,8 +6454,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 @@ -6559,8 +6468,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 @@ -6684,8 +6592,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6695,8 +6602,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6816,8 +6722,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6827,8 +6732,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -6948,8 +6852,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -6959,8 +6862,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -7080,8 +6982,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 
v1, s1 ; GFX11-WGP-NEXT: ds_load_b32 v0, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v1, v0 @@ -7091,8 +6992,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_load_b32 v0, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v1, v0 @@ -7194,8 +7094,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7203,8 +7102,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7303,8 +7201,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7312,8 +7209,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7412,8 +7308,7 
@@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7421,8 +7316,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7521,8 +7415,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7530,8 +7423,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s0 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s1 :: v_dual_mov_b32 v1, s0 ; GFX11-CU-NEXT: ds_store_b32 v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 %in, i32 addrspace(3)* %out) { @@ -7630,8 +7522,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7639,8 +7530,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7739,8 +7629,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7748,8 +7637,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7848,8 +7736,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7857,8 +7744,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -7957,8 +7843,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -7966,8 +7851,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -8066,8 +7950,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-WGP-NEXT: s_endpgm ; @@ -8075,8 +7958,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v0, v0, v1 ; GFX11-CU-NEXT: s_endpgm i32 addrspace(3)* %out, i32 %in) { @@ -8193,8 +8075,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8204,8 +8085,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; 
GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8325,8 +8205,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8336,8 +8215,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8457,8 +8335,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: ds_store_b32 v0, v1 @@ -8468,8 +8345,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: ds_storexchg_rtn_b32 v1, v0, v1 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: ds_store_b32 v0, v1 @@ -8580,8 +8456,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 
; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8590,8 +8465,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8701,8 +8575,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8711,8 +8584,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8822,8 +8694,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8832,8 +8703,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -8943,8 +8813,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -8953,8 +8822,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9064,8 +8932,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9074,8 +8941,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9185,8 +9051,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: 
v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9195,8 +9060,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9306,8 +9170,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9316,8 +9179,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9427,8 +9289,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9437,8 +9298,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, 
s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9548,8 +9408,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9558,8 +9417,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9669,8 +9527,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9679,8 +9536,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9790,8 +9646,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; 
GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9800,8 +9655,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -9911,8 +9765,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -9921,8 +9774,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10032,8 +9884,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -10042,8 +9893,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10153,8 +10003,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -10163,8 +10012,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10274,8 +10122,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_endpgm @@ -10284,8 +10131,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_b32 v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_endpgm @@ -10413,8 +10259,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10425,8 +10270,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10558,8 +10402,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10570,8 +10413,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10703,8 +10545,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; 
GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10715,8 +10556,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10848,8 +10688,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -10860,8 +10699,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -10993,8 +10831,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11005,8 +10842,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, 
s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11138,8 +10974,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11150,8 +10985,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11283,8 +11117,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11295,8 +11128,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11428,8 +11260,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11440,8 +11271,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11573,8 +11403,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11585,8 +11414,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11718,8 +11546,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: 
s_waitcnt lgkmcnt(0) @@ -11730,8 +11557,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -11863,8 +11689,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -11875,8 +11700,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12008,8 +11832,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12020,8 +11843,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: 
v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12153,8 +11975,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12165,8 +11986,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12298,8 +12118,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12310,8 +12129,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) @@ -12443,8 +12261,7 @@ ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-WGP-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-WGP-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) @@ -12455,8 +12272,7 @@ ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b128 s[0:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v0, s0 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 ; GFX11-CU-NEXT: v_mov_b32_e32 v2, s2 ; GFX11-CU-NEXT: ds_cmpstore_rtn_b32 v1, v0, v1, v2 offset:16 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-nontemporal.ll @@ -162,6 +162,7 @@ ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 slc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_nontemporal_load_0: @@ -174,6 +175,7 @@ ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 slc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(5)* %in, i32 addrspace(1)* %out) { entry: @@ -331,25 +333,25 @@ ; GFX11-WGP-LABEL: private_nontemporal_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; 
GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 slc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_nontemporal_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 slc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(5)* %in, i32 addrspace(1)* %out) { entry: @@ -509,6 +511,7 @@ ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 glc slc dlc +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_nontemporal_store_0: @@ -521,6 +524,7 @@ ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 glc slc dlc +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(5)* %out) { entry: @@ -678,12 +682,12 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 -; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 glc slc dlc +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; 
GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_nontemporal_store_1: @@ -691,12 +695,12 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 -; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 glc slc dlc +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(5)* %out) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll --- a/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll +++ b/llvm/test/CodeGen/AMDGPU/memory-legalizer-private-volatile.ll @@ -106,6 +106,7 @@ ; GFX11-WGP-NEXT: scratch_load_b32 v0, off, s2 glc dlc ; GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_volatile_load_0: @@ -118,6 +119,7 @@ ; GFX11-CU-NEXT: scratch_load_b32 v0, off, s2 glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(5)* %in, i32 addrspace(1)* %out) { entry: @@ -221,25 +223,25 @@ ; GFX11-WGP-LABEL: private_volatile_load_1: ; GFX11-WGP: ; %bb.0: ; %entry ; GFX11-WGP-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: scratch_load_b32 v0, v0, s2 glc dlc ; 
GFX11-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX11-WGP-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_volatile_load_1: ; GFX11-CU: ; %bb.0: ; %entry ; GFX11-CU-NEXT: s_load_b32 s2, s[0:1], 0x0 -; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: s_load_b64 s[0:1], s[0:1], 0x8 -; GFX11-CU-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: scratch_load_b32 v0, v0, s2 glc dlc ; GFX11-CU-NEXT: s_waitcnt vmcnt(0) ; GFX11-CU-NEXT: global_store_b32 v1, v0, s[0:1] +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(5)* %in, i32 addrspace(1)* %out) { entry: @@ -353,6 +355,7 @@ ; GFX11-WGP-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-WGP-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; GFX11-CU-LABEL: private_volatile_store_0: @@ -366,6 +369,7 @@ ; GFX11-CU-NEXT: v_mov_b32_e32 v0, s1 ; GFX11-CU-NEXT: scratch_store_b32 off, v0, s0 dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(5)* %out) { entry: @@ -474,13 +478,13 @@ ; GFX11-WGP-NEXT: s_clause 0x1 ; GFX11-WGP-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-WGP-NEXT: s_load_b32 s0, s[0:1], 0x8 -; GFX11-WGP-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-WGP-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-WGP-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-WGP-NEXT: scratch_store_b32 v0, v1, s0 dlc ; GFX11-WGP-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-WGP-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-WGP-NEXT: s_endpgm ; ; 
GFX11-CU-LABEL: private_volatile_store_1: @@ -488,13 +492,13 @@ ; GFX11-CU-NEXT: s_clause 0x1 ; GFX11-CU-NEXT: s_load_b64 s[2:3], s[0:1], 0x0 ; GFX11-CU-NEXT: s_load_b32 s0, s[0:1], 0x8 -; GFX11-CU-NEXT: v_lshlrev_b32_e32 v0, 2, v0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-CU-NEXT: s_load_b32 s1, s[2:3], 0x0 ; GFX11-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX11-CU-NEXT: v_dual_mov_b32 v1, s1 :: v_dual_lshlrev_b32 v0, 2, v0 ; GFX11-CU-NEXT: scratch_store_b32 v0, v1, s0 dlc ; GFX11-CU-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-CU-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) ; GFX11-CU-NEXT: s_endpgm i32 addrspace(1)* %in, i32 addrspace(5)* %out) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/verify-vopd.mir b/llvm/test/CodeGen/AMDGPU/verify-vopd.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/verify-vopd.mir @@ -0,0 +1,11 @@ +# RUN: not --crash llc -march=amdgcn -mcpu=gfx1100 -mattr=+wavefrontsize32,-wavefrontsize64 -run-pass machineverifier -o /dev/null %s 2>&1 | FileCheck -check-prefix=GFX11-ERR %s + +# GFX11-ERR: *** Bad machine code: VOP* instruction violates constant bus restriction *** +# GFX11-ERR: $vgpr2, $vgpr3 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32 $sgpr0, $vgpr0, $sgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $vcc, implicit $vcc_lo +--- +name: vopd_cndmask_2sgpr +body: | + bb.0: + liveins: $sgpr0, $sgpr1, $vgpr0, $vgpr1 + $vgpr2, $vgpr3 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32 $sgpr0, $vgpr0, $sgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $vcc, implicit $vcc_lo +... 
diff --git a/llvm/test/CodeGen/AMDGPU/vopd-combine.mir b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir new file mode 100644 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/vopd-combine.mir @@ -0,0 +1,543 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=postmisched %s -o - | FileCheck -check-prefix=SCHED %s +# RUN: llc -march=amdgcn -mcpu=gfx1100 -verify-machineinstrs -run-pass=postmisched,gcn-create-vopd %s -o - | FileCheck -check-prefix=PAIR %s + +--- | + @lds = external addrspace(3) global [8 x i8] + define void @vopd_schedule() { ret void } + define void @vopd_fmamk() { ret void } + define void @vopd_fmamk_fail() { ret void } + define void @vopd_cndmask() { ret void } + define void @vopd_mov() { ret void } + define void @vopd_mov_mov() { ret void } + define void @vopd_constants_fail() { ret void } + define void @vopd_constants_inlinable() { ret void } + define void @vopd_constants_same() { ret void } + define void @vopd_mov_fmaak_constants_same() { ret void } + define void @vopd_debug() { ret void } + define void @vopd_schedule_unconstrained() { ret void } + define void @vopd_schedule_unconstrained_2() { ret void } + define void @vopd_mov_fixup() { ret void } + define void @vopd_mov_fixup_fail() { ret void } +... 
+ +--- +name: vopd_schedule +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_schedule + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_schedule + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; can fuse vgpr3 and vgpr6 writing insts only due to reg constraints + $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr6 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + +... 
+ +--- +name: vopd_fmamk +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_fmamk + ; SCHED: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 10, killed $vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_fmamk + ; PAIR: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32 killed $vgpr0, 10, killed $vgpr3, killed $vgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + ; should pair + $vgpr2 = V_FMAC_F32_e32 $vgpr1, $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr5 = V_FMAMK_F32 $vgpr0, 10, $vgpr3, implicit $mode, implicit $exec + +... 
+ +--- +name: vopd_fmamk_fail +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_fmamk_fail + ; SCHED: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_XOR_B32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 10, killed $vgpr4, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_fmamk_fail + ; PAIR: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2 = V_XOR_B32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 10, killed $vgpr4, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr2 = V_FMAC_F32_e32 killed $vgpr1, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_XOR_B32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr4 = IMPLICIT_DEF + ; should not pair + $vgpr2 = V_FMAC_F32_e32 $vgpr1, $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr5 = V_FMAMK_F32 $vgpr0, 10, $vgpr4, implicit $mode, implicit $exec + +... 
+ +--- +name: vopd_cndmask +tracksRegLiveness: true +body: | + bb.0: + ; SCHED-LABEL: name: vopd_cndmask + ; SCHED: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr20 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 $sgpr20, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-LABEL: name: vopd_cndmask + ; PAIR: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr20 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr2, $vgpr5 = V_DUAL_FMAC_F32_e32_X_CNDMASK_B32_e32 $sgpr20, killed $vgpr1, $vgpr0, $vgpr3, implicit $exec, implicit $vcc, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-NEXT: $vgpr7 = V_CNDMASK_B32_e32 killed $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-NEXT: $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr9 = V_CNDMASK_B32_e32 killed $sgpr20, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $sgpr20 = IMPLICIT_DEF + ; should pair + $vgpr2 = V_FMAC_F32_e32 $sgpr20, $vgpr1, $vgpr2, implicit $mode, implicit $exec + 
$vgpr5 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; should not pair, uses 3 scalars (implicit vcc) + $vgpr4 = V_FMAMK_F32 $sgpr20, 12345, $vgpr3, implicit $mode, implicit $exec + $vgpr7 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; should not pair, uses 3 scalars (implicit vcc) + $vgpr6 = V_ADD_F32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec + $vgpr9 = V_CNDMASK_B32_e32 $sgpr20, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + +... + +--- +name: vopd_mov +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_mov + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 killed $vgpr0, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_ADD_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_mov + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32 killed $vgpr0, killed $vgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = V_MOV_B32_e32 $vgpr0, implicit $exec + $vgpr3 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + +... 
+ +--- +name: vopd_mov_mov +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_mov_mov + ; SCHED: $sgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr7 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 killed $sgpr0, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_MOV_B32_e32 killed $sgpr7, implicit $exec + ; PAIR-LABEL: name: vopd_mov_mov + ; PAIR: $sgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr7 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32 killed $sgpr0, killed $sgpr7, implicit $exec, implicit $exec, implicit $exec + $sgpr0 = IMPLICIT_DEF + $sgpr7 = IMPLICIT_DEF + $vgpr2 = V_MOV_B32_e32 $sgpr0, implicit $exec + $vgpr3 = V_MOV_B32_e32 $sgpr7, implicit $exec + +... + + +--- +name: vopd_constants_fail +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_constants_fail + ; SCHED: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 100, killed $vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 99, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_constants_fail + ; PAIR: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 100, killed $vgpr3, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr2 = V_FMAC_F32_e32 99, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + ; should not pair with two different literals + $vgpr2 = V_FMAC_F32_e32 99, $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr5 = V_FMAMK_F32 $vgpr0, 100, $vgpr3, implicit $mode, implicit $exec + +... 
+ +--- +name: vopd_constants_inlinable +tracksRegLiveness: true +body: | + bb.0: + ; SCHED-LABEL: name: vopd_constants_inlinable + ; SCHED: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 100, killed $vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 4, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_constants_inlinable + ; PAIR: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32 killed $vgpr0, 100, killed $vgpr3, 4, killed $vgpr1, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + ; can pair since 4 is inlinable + $vgpr2 = V_FMAC_F32_e32 4, $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr5 = V_FMAMK_F32 $vgpr0, 100, $vgpr3, implicit $mode, implicit $exec + +... 
+ + +--- +name: vopd_constants_same +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_constants_same + ; SCHED: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr5 = V_FMAMK_F32 killed $vgpr0, 100, killed $vgpr3, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 100, killed $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_constants_same + ; PAIR: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr5, $vgpr2 = V_DUAL_FMAMK_F32_X_FMAC_F32_e32 killed $vgpr0, 100, killed $vgpr3, 100, killed $vgpr1, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + ; should be able to pair using 1 deduplicated literal + $vgpr2 = V_FMAC_F32_e32 100, $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr5 = V_FMAMK_F32 $vgpr0, 100, $vgpr3, implicit $mode, implicit $exec + +... 
+ +--- +name: vopd_mov_fmaak_constants_same +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_mov_fmaak_constants_same + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $sgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = V_MOV_B32_e32 981467136, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_FMAAK_F32 killed $sgpr0, killed $vgpr0, 981467136, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_mov_fmaak_constants_same + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $sgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1, $vgpr2 = V_DUAL_MOV_B32_e32_X_FMAAK_F32 981467136, killed $sgpr0, killed $vgpr0, 981467136, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $sgpr0 = IMPLICIT_DEF + ; should be able to pair using 1 deduplicated literal + $vgpr1 = V_MOV_B32_e32 981467136, implicit $exec + $vgpr2 = V_FMAAK_F32 $sgpr0, $vgpr0, 981467136, implicit $mode, implicit $exec + +... + +--- +name: vopd_debug +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_debug + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: DBG_VALUE 0 + ; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_debug + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32 killed $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: DBG_VALUE 0 + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + ; should pair: the intervening DBG_VALUE must not block VOPD creation + $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + DBG_VALUE 0 + $vgpr6 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + +... 
+ +--- +name: vopd_schedule_unconstrained +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_schedule_unconstrained + ; SCHED: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr12 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr17 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr10 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_schedule_unconstrained + ; PAIR: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; 
PAIR-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr12, $vgpr19 = V_DUAL_ADD_F32_e32_X_CNDMASK_B32_e32 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $exec, implicit $vcc, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-NEXT: $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-NEXT: $vgpr17, $vgpr10 = V_DUAL_MUL_F32_e32_X_CNDMASK_B32_e32 killed $vgpr0, $vgpr0, $vgpr1, $vgpr2, implicit $exec, implicit $vcc, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr14 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr10 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr6 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; $vgpr9 = V_FMAMK_F32 $vgpr0, 10, $vgpr2, implicit $mode, implicit $exec + $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + $vgpr12 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr17 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit 
$exec + ; $vgpr18 = V_FMAMK_F32 $vgpr0, 10, $vgpr3, implicit $mode, implicit $exec + ; $vgpr11 = V_FMAC_F32_e32 10, $vgpr1, $vgpr11, implicit $mode, implicit $exec + $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr14 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + +... + +--- +name: vopd_schedule_unconstrained_2 +tracksRegLiveness: true +body: | + bb.0: + + ; SCHED-LABEL: name: vopd_schedule_unconstrained_2 + ; SCHED: $vgpr2 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr3 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr20 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr35 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr6 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr29 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr20 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr20, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr10 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr17 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr11 = 
V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr12 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr37 = V_CNDMASK_B32_e32 $vgpr0, killed $vgpr3, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr14 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr21 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr24 = V_MUL_F32_e32 killed $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr22 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $vcc + ; SCHED-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-LABEL: name: vopd_schedule_unconstrained_2 + ; PAIR: $vgpr2 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr3 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr20 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr16, $vgpr35 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr3, $vgpr6 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32 $vgpr1, $vgpr1, $vgpr0, $vgpr0, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, killed $vgpr2, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, 
implicit $exec + ; PAIR-NEXT: $vgpr4, $vgpr29 = V_DUAL_SUB_F32_e32_X_CNDMASK_B32_e32 $vgpr1, $vgpr1, $vgpr0, $vgpr3, implicit $exec, implicit $vcc, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-NEXT: $vgpr19, $vgpr20 = V_DUAL_CNDMASK_B32_e32_X_FMAC_F32_e32 $vgpr0, $vgpr3, 10, $vgpr1, implicit $exec, implicit $mode, implicit $vcc, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-NEXT: $vgpr10, $vgpr17 = V_DUAL_CNDMASK_B32_e32_X_MUL_F32_e32 $vgpr1, $vgpr2, $vgpr0, $vgpr0, implicit $exec, implicit $mode, implicit $vcc, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr11, $vgpr12 = V_DUAL_CNDMASK_B32_e32_X_ADD_F32_e32 $vgpr0, $vgpr3, $vgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $vcc, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr37, $vgpr14 = V_DUAL_CNDMASK_B32_e32_X_SUB_F32_e32 $vgpr0, killed $vgpr3, $vgpr1, $vgpr1, implicit $exec, implicit $mode, implicit $vcc, implicit $mode, implicit $exec, implicit $vcc, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr21, $vgpr24 = V_DUAL_SUB_F32_e32_X_MUL_F32_e32 $vgpr1, $vgpr1, killed $vgpr0, $vgpr0, implicit $exec, implicit $mode, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-NEXT: $vgpr22 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr31 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, killed $vgpr2, implicit $mode, implicit $exec, implicit $vcc + ; PAIR-NEXT: $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, 
implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr32 = V_SUB_F32_e32 killed $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + $vgpr2 = IMPLICIT_DEF + $vgpr3 = IMPLICIT_DEF + $vgpr2 = V_FMAC_F32_e32 10, $vgpr1, $vgpr2, implicit $mode, implicit $exec + $vgpr2 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr3 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr10 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + $vgpr4 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr6 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; $vgpr9 = V_FMAMK_F32 $vgpr0, 10, $vgpr2, implicit $mode, implicit $exec + $vgpr11 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + $vgpr19 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + $vgpr12 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr17 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; $vgpr18 = V_FMAMK_F32 $vgpr0, 10, $vgpr3, implicit $mode, implicit $exec + ; $vgpr11 = V_FMAC_F32_e32 10, $vgpr1, $vgpr11, implicit $mode, implicit $exec + $vgpr15 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + $vgpr16 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr14 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr20 = IMPLICIT_DEF + $vgpr20 = V_FMAC_F32_e32 10, $vgpr1, $vgpr20, implicit $mode, implicit $exec + $vgpr20 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr21 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr28 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + $vgpr22 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr24 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; $vgpr9 = V_FMAMK_F32 $vgpr0, 
10, $vgpr2, implicit $mode, implicit $exec + $vgpr29 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + $vgpr37 = V_CNDMASK_B32_e32 $vgpr0, $vgpr3, implicit $mode, implicit $exec, implicit $vcc + $vgpr31 = V_ADD_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr35 = V_MUL_F32_e32 $vgpr0, $vgpr0, implicit $mode, implicit $exec + ; $vgpr18 = V_FMAMK_F32 $vgpr0, 10, $vgpr3, implicit $mode, implicit $exec + ; $vgpr11 = V_FMAC_F32_e32 10, $vgpr1, $vgpr11, implicit $mode, implicit $exec + $vgpr33 = V_CNDMASK_B32_e32 $vgpr1, $vgpr2, implicit $mode, implicit $exec, implicit $vcc + $vgpr34 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + $vgpr32 = V_SUB_F32_e32 $vgpr1, $vgpr1, implicit $mode, implicit $exec + +... + +--- +name: vopd_mov_fixup +tracksRegLiveness: true +body: | + bb.0: + ; SCHED-LABEL: name: vopd_mov_fixup + ; SCHED: $vgpr0 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr1 = IMPLICIT_DEF + ; SCHED-NEXT: $vgpr2 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec + ; SCHED-NEXT: $vgpr3 = V_ADD_F32_e32 killed $vgpr0, killed $vgpr1, implicit $mode, implicit $exec + ; SCHED-NEXT: $vgpr4 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec + ; SCHED-NEXT: $vgpr5 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec + ; PAIR-LABEL: name: vopd_mov_fixup + ; PAIR: $vgpr0 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr1 = IMPLICIT_DEF + ; PAIR-NEXT: $vgpr2, $vgpr3 = V_DUAL_MOV_B32_e32_X_ADD_F32_e32 target-flags(amdgpu-abs32-lo) @lds, killed $vgpr0, killed $vgpr1, implicit $exec, implicit $mode, implicit $exec, implicit $mode, implicit $exec + ; PAIR-NEXT: $vgpr4, $vgpr5 = V_DUAL_MOV_B32_e32_X_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, target-flags(amdgpu-abs32-lo) @lds, implicit $exec, implicit $exec, implicit $exec + $vgpr0 = IMPLICIT_DEF + $vgpr1 = IMPLICIT_DEF + ; should pair + $vgpr2 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec + $vgpr3 = V_ADD_F32_e32 $vgpr0, 
$vgpr1, implicit $mode, implicit $exec + ; should pair + $vgpr4 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec + $vgpr5 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec +... + +--- +name: vopd_mov_fixup_fail +tracksRegLiveness: true +body: | + bb.0: + ; SCHED-LABEL: name: vopd_mov_fixup_fail + ; SCHED: $vgpr0 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec + ; SCHED-NEXT: $vgpr1 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds + 4, implicit $exec + ; PAIR-LABEL: name: vopd_mov_fixup_fail + ; PAIR: $vgpr0 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec + ; PAIR-NEXT: $vgpr1 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds + 4, implicit $exec + $vgpr0 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds, implicit $exec + $vgpr1 = V_MOV_B32_e32 target-flags(amdgpu-abs32-lo) @lds + 4, implicit $exec +... diff --git a/llvm/test/CodeGen/AMDGPU/wqm-gfx11.ll b/llvm/test/CodeGen/AMDGPU/wqm-gfx11.ll --- a/llvm/test/CodeGen/AMDGPU/wqm-gfx11.ll +++ b/llvm/test/CodeGen/AMDGPU/wqm-gfx11.ll @@ -12,10 +12,8 @@ ; CHECK-NEXT: lds_param_load v4, attr0.y wait_vdst:15 ; CHECK-NEXT: lds_param_load v5, attr0.z wait_vdst:15 ; CHECK-NEXT: s_mov_b32 exec_lo, s0 -; CHECK-NEXT: s_waitcnt expcnt(2) -; CHECK-NEXT: v_add_f32_e32 v0, v3, v0 ; CHECK-NEXT: s_waitcnt expcnt(1) -; CHECK-NEXT: v_add_f32_e32 v1, v4, v1 +; CHECK-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_add_f32 v1, v4, v1 ; CHECK-NEXT: s_waitcnt expcnt(0) ; CHECK-NEXT: v_add_f32_e32 v2, v5, v2 ; CHECK-NEXT: ; return to shader part epilog @@ -43,10 +41,8 @@ ; CHECK-NEXT: s_mov_b32 m0, s2 ; CHECK-NEXT: lds_direct_load v5 wait_vdst:15 ; CHECK-NEXT: s_mov_b32 exec_lo, s0 -; CHECK-NEXT: s_waitcnt expcnt(2) -; CHECK-NEXT: v_add_f32_e32 v0, v3, v0 ; CHECK-NEXT: s_waitcnt expcnt(1) -; CHECK-NEXT: v_add_f32_e32 v1, v4, v1 +; CHECK-NEXT: v_dual_add_f32 v0, v3, v0 :: v_dual_add_f32 v1, v4, v1 ; CHECK-NEXT: s_waitcnt expcnt(0) ; CHECK-NEXT: v_add_f32_e32 v2, v5, v2 ; 
CHECK-NEXT: ; return to shader part epilog