Index: llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp +++ llvm/lib/Target/AMDGPU/SIFormMemoryClauses.cpp @@ -6,10 +6,12 @@ // //===----------------------------------------------------------------------===// // -/// \file -/// This pass creates bundles of SMEM and VMEM instructions forming memory -/// clauses if XNACK is enabled. Def operands of clauses are marked as early -/// clobber to make sure we will not override any source within a clause. +/// \file This pass extends the live ranges of registers +/// used as pointers in sequences of adjacent SMEM and VMEM instructions if +/// XNACK is enabled. A load that would overwrite a pointer would require +/// breaking the soft clause. Artificially extend the live ranges of the pointer +/// operands by adding implicit-def early-clobber operands throughout the soft +/// clause. /// //===----------------------------------------------------------------------===// @@ -59,9 +61,6 @@ } private: - template - void forAllLanes(Register Reg, LaneBitmask LaneMask, Callable Func) const; - bool canBundle(const MachineInstr &MI, const RegUse &Defs, const RegUse &Uses) const; bool checkPressure(const MachineInstr &MI, GCNDownwardRPTracker &RPT); @@ -149,59 +148,6 @@ return S; } -template -void SIFormMemoryClauses::forAllLanes(Register Reg, LaneBitmask LaneMask, - Callable Func) const { - if (LaneMask.all() || Reg.isPhysical() || - LaneMask == MRI->getMaxLaneMaskForVReg(Reg)) { - Func(0); - return; - } - - const TargetRegisterClass *RC = MRI->getRegClass(Reg); - unsigned E = TRI->getNumSubRegIndices(); - SmallVector CoveringSubregs; - for (unsigned Idx = 1; Idx < E; ++Idx) { - // Is this index even compatible with the given class? 
- if (TRI->getSubClassWithSubReg(RC, Idx) != RC) - continue; - LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx); - // Early exit if we found a perfect match. - if (SubRegMask == LaneMask) { - Func(Idx); - return; - } - - if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none()) - continue; - - CoveringSubregs.push_back(Idx); - } - - llvm::sort(CoveringSubregs, [this](unsigned A, unsigned B) { - LaneBitmask MaskA = TRI->getSubRegIndexLaneMask(A); - LaneBitmask MaskB = TRI->getSubRegIndexLaneMask(B); - unsigned NA = MaskA.getNumLanes(); - unsigned NB = MaskB.getNumLanes(); - if (NA != NB) - return NA > NB; - return MaskA.getHighestLane() > MaskB.getHighestLane(); - }); - - for (unsigned Idx : CoveringSubregs) { - LaneBitmask SubRegMask = TRI->getSubRegIndexLaneMask(Idx); - if ((SubRegMask & ~LaneMask).any() || (SubRegMask & LaneMask).none()) - continue; - - Func(Idx); - LaneMask &= ~SubRegMask; - if (LaneMask.none()) - return; - } - - llvm_unreachable("Failed to find all subregs to cover lane mask"); -} - // Returns false if there is a use of a def already in the map. // In this case we must break the clause. bool SIFormMemoryClauses::canBundle(const MachineInstr &MI, const RegUse &Defs, @@ -332,8 +278,6 @@ unsigned FuncMaxClause = AMDGPU::getIntegerAttribute( MF.getFunction(), "amdgpu-max-memory-clause", MaxClause); - SmallVector DbgInstrs; - for (MachineBasicBlock &MBB : MF) { GCNDownwardRPTracker RPT(*LIS); MachineBasicBlock::instr_iterator Next; @@ -341,7 +285,7 @@ MachineInstr &MI = *I; Next = std::next(I); - if (MI.isDebugInstr()) + if (MI.isMetaInstruction()) continue; bool IsVMEM = isVMEMClauseInst(MI); @@ -363,11 +307,11 @@ continue; } + MachineBasicBlock::iterator LastClauseInst = Next; unsigned Length = 1; for ( ; Next != E && Length < FuncMaxClause; ++Next) { - // Debug instructions should not change the bundling. 
We need to move - // these after the bundle - if (Next->isDebugInstr()) + // Debug instructions should not change the kill insertion. + if (Next->isMetaInstruction()) continue; if (!isValidClauseInst(*Next, IsVMEM)) @@ -379,6 +323,7 @@ if (!processRegUses(*Next, Defs, Uses, RPT)) break; + LastClauseInst = Next; ++Length; } if (Length < 2) { @@ -389,48 +334,73 @@ Changed = true; MFI->limitOccupancy(LastRecordedOccupancy); - auto B = BuildMI(MBB, I, DebugLoc(), TII->get(TargetOpcode::BUNDLE)); - Ind->insertMachineInstrInMaps(*B); + assert(!LastClauseInst->isMetaInstruction()); - // Restore the state after processing the bundle. - RPT.reset(*B, &LiveRegsCopy); - DbgInstrs.clear(); + SlotIndex ClauseLiveInIdx = LIS->getInstructionIndex(MI); + SlotIndex ClauseLiveOutIdx = + LIS->getInstructionIndex(*LastClauseInst).getNextIndex(); - auto BundleNext = I; - for (auto BI = I; BI != Next; BI = BundleNext) { - BundleNext = std::next(BI); + // Track the last inserted kill. + MachineInstrBuilder Kill; - if (BI->isDebugValue()) { - DbgInstrs.push_back(BI->removeFromParent()); + // Insert one kill per register, with operands covering all necessary + // subregisters. + for (auto &&R : Uses) { + Register Reg = R.first; + if (Reg.isPhysical()) continue; + + // Collect the register operands we should extend the live ranges of. 
+ SmallVector> KillOps; + const LiveInterval &LI = LIS->getInterval(R.first); + + if (!LI.hasSubRanges()) { + if (!LI.liveAt(ClauseLiveOutIdx)) { + KillOps.emplace_back(R.second.first | RegState::Kill, + AMDGPU::NoSubRegister); + } + } else { + LaneBitmask KilledMask; + for (const LiveInterval::SubRange &SR : LI.subranges()) { + if (SR.liveAt(ClauseLiveInIdx) && !SR.liveAt(ClauseLiveOutIdx)) + KilledMask |= SR.LaneMask; + } + + if (KilledMask.none()) + continue; + + SmallVector KilledIndexes; + bool Success = TRI->getCoveringSubRegIndexes( + *MRI, MRI->getRegClass(Reg), KilledMask, KilledIndexes); + (void)Success; + assert(Success && "Failed to find subregister mask to cover lanes"); + for (unsigned SubReg : KilledIndexes) { + KillOps.emplace_back(R.second.first | RegState::Kill, SubReg); + } } - BI->bundleWithPred(); - Ind->removeSingleMachineInstrFromMaps(*BI); + if (KillOps.empty()) + continue; - for (MachineOperand &MO : BI->defs()) - if (MO.readsReg()) - MO.setIsInternalRead(true); + // We only want to extend the live ranges of used registers. If they + // already have existing uses beyond the bundle, we don't need the kill. + // + // It's possible all of the use registers were already live past the + // bundle. + Kill = BuildMI(*MI.getParent(), std::next(LastClauseInst), + DebugLoc(), TII->get(AMDGPU::KILL)); + for (auto &Op : KillOps) + Kill.addUse(Reg, std::get<0>(Op), std::get<1>(Op)); + Ind->insertMachineInstrInMaps(*Kill); } - // Replace any debug instructions after the new bundle. 
- for (MachineInstr *DbgInst : DbgInstrs) - MBB.insert(Next, DbgInst); - - for (auto &&R : Defs) { - forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) { - unsigned S = R.second.first | RegState::EarlyClobber; - if (!SubReg) - S &= ~(RegState::Undef | RegState::Dead); - B.addDef(R.first, S, SubReg); - }); + if (!Kill) { + RPT.reset(MI, &LiveRegsCopy); + continue; } - for (auto &&R : Uses) { - forAllLanes(R.first, R.second.second, [&R, &B](unsigned SubReg) { - B.addUse(R.first, R.second.first & ~RegState::Kill, SubReg); - }); - } + // Restore the state after processing the end of the bundle. + RPT.reset(*Kill, &LiveRegsCopy); for (auto &&R : Defs) { Register Reg = R.first; Index: llvm/lib/Target/AMDGPU/SIPostRABundler.cpp =================================================================== --- llvm/lib/Target/AMDGPU/SIPostRABundler.cpp +++ llvm/lib/Target/AMDGPU/SIPostRABundler.cpp @@ -173,27 +173,33 @@ // clauses to hint register allocation. Look for kills that look like // this, and erase them. if (Next != E && Next->isKill()) { - MachineInstr &Kill = *Next; // TODO: Should maybe back-propagate kill flags to the bundle. for (const MachineInstr &BundleMI : make_range(BundleStart, Next)) collectUsedRegUnits(BundleMI, BundleUsedRegUnits); - collectUsedRegUnits(Kill, KillUsedRegUnits); BundleUsedRegUnits.flip(); - KillUsedRegUnits &= BundleUsedRegUnits; - // Erase the kill if it's a subset of the used registers. - // - // TODO: Should we just remove all kills? Is there any real reason to - // keep them after RA? - if (KillUsedRegUnits.none()) { - ++Next; - Kill.eraseFromParent(); + while (Next != E && Next->isKill()) { + MachineInstr &Kill = *Next; + collectUsedRegUnits(Kill, KillUsedRegUnits); + + KillUsedRegUnits &= BundleUsedRegUnits; + + // Erase the kill if it's a subset of the used registers. + // + // TODO: Should we just remove all kills? Is there any real reason to + // keep them after RA? 
+ if (KillUsedRegUnits.none()) { + ++Next; + Kill.eraseFromParent(); + } else + break; + + KillUsedRegUnits.reset(); } BundleUsedRegUnits.reset(); - KillUsedRegUnits.reset(); } finalizeBundle(MBB, BundleStart, Next); Index: llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/extractelement.i128.ll @@ -94,57 +94,57 @@ ; GFX9-LABEL: extractelement_vgpr_v4i128_vgpr_idx: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 -; GFX9-NEXT: v_lshlrev_b32_e32 v2, 1, v2 -; GFX9-NEXT: v_add_u32_e32 v3, 1, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v2 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v3 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v2 +; GFX9-NEXT: v_lshlrev_b32_e32 v16, 1, v2 +; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 +; GFX9-NEXT: v_add_u32_e32 v17, 1, v16 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 1, v16 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 1, v17 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[6:7], 6, v16 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], 7, v16 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e64 v12, v8, v10, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v13, v9, v11, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e32 v8, v8, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v9, v9, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v10, v2, v4, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v3, v5, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v5, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v16 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e32 v10, v12, v4, vcc -; GFX9-NEXT: 
v_cndmask_b32_e32 v11, v13, v5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v8, v4, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v9, v5, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v2 -; GFX9-NEXT: v_cndmask_b32_e32 v16, v10, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v17, v11, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v10, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v11, v7, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 2, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v6, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v7, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v16 +; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc ; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 ; GFX9-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 3, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v6, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v7, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v2 -; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v16 +; GFX9-NEXT: v_cmp_eq_u32_e64 s[4:5], 7, v17 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v0, v16, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v1, v17, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v3 -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v8, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v9, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v2 +; GFX9-NEXT: v_cndmask_b32_e32 v0, v4, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 4, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v8, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v9, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v16 ; GFX9-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc ; GFX9-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v3 
-; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v3 +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 5, v17 +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v11, vcc +; GFX9-NEXT: v_cmp_eq_u32_e32 vcc, 6, v17 ; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: v_cndmask_b32_e32 v2, v2, v12, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v12, s[6:7] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v13, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e32 v3, v5, v13, vcc -; GFX9-NEXT: v_cndmask_b32_e32 v4, v4, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v3, v3, v13, vcc ; GFX9-NEXT: v_cndmask_b32_e64 v0, v0, v14, s[8:9] ; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v15, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v14, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v2, v14, s[4:5] ; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v15, s[4:5] ; GFX9-NEXT: s_setpc_b64 s[30:31] ; Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement-stack-lower.ll @@ -231,6 +231,21 @@ ; GCN-NEXT: buffer_load_dword v13, v28, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v14, v29, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v15, v30, s[0:3], 0 offen +; GCN-NEXT: ; kill: killed $vgpr30 +; GCN-NEXT: ; kill: killed $vgpr19 +; GCN-NEXT: ; kill: killed $vgpr23 +; GCN-NEXT: ; kill: killed $vgpr27 +; GCN-NEXT: ; kill: killed $vgpr20 +; GCN-NEXT: ; kill: killed $vgpr24 +; GCN-NEXT: ; kill: killed $vgpr28 +; GCN-NEXT: ; kill: killed $vgpr17 +; GCN-NEXT: ; kill: killed $vgpr21 +; GCN-NEXT: ; kill: killed $vgpr25 +; GCN-NEXT: ; kill: killed $vgpr0 +; GCN-NEXT: ; kill: killed $vgpr29 +; GCN-NEXT: ; kill: killed $vgpr18 +; GCN-NEXT: ; kill: killed $vgpr22 +; GCN-NEXT: ; kill: killed $vgpr26 ; GCN-NEXT: buffer_load_dword v16, v31, 
s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v17, v32, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v18, v33, s[0:3], 0 offen @@ -246,6 +261,21 @@ ; GCN-NEXT: buffer_load_dword v28, v43, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v29, v44, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v30, v45, s[0:3], 0 offen +; GCN-NEXT: ; kill: killed $vgpr45 +; GCN-NEXT: ; kill: killed $vgpr34 +; GCN-NEXT: ; kill: killed $vgpr38 +; GCN-NEXT: ; kill: killed $vgpr42 +; GCN-NEXT: ; kill: killed $vgpr31 +; GCN-NEXT: ; kill: killed $vgpr35 +; GCN-NEXT: ; kill: killed $vgpr39 +; GCN-NEXT: ; kill: killed $vgpr43 +; GCN-NEXT: ; kill: killed $vgpr32 +; GCN-NEXT: ; kill: killed $vgpr36 +; GCN-NEXT: ; kill: killed $vgpr40 +; GCN-NEXT: ; kill: killed $vgpr44 +; GCN-NEXT: ; kill: killed $vgpr33 +; GCN-NEXT: ; kill: killed $vgpr37 +; GCN-NEXT: ; kill: killed $vgpr41 ; GCN-NEXT: buffer_load_dword v31, v46, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v32, v47, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v33, v48, s[0:3], 0 offen @@ -261,6 +291,21 @@ ; GCN-NEXT: buffer_load_dword v43, v58, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v44, v59, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v45, v60, s[0:3], 0 offen +; GCN-NEXT: ; kill: killed $vgpr60 +; GCN-NEXT: ; kill: killed $vgpr49 +; GCN-NEXT: ; kill: killed $vgpr53 +; GCN-NEXT: ; kill: killed $vgpr57 +; GCN-NEXT: ; kill: killed $vgpr46 +; GCN-NEXT: ; kill: killed $vgpr50 +; GCN-NEXT: ; kill: killed $vgpr54 +; GCN-NEXT: ; kill: killed $vgpr58 +; GCN-NEXT: ; kill: killed $vgpr47 +; GCN-NEXT: ; kill: killed $vgpr51 +; GCN-NEXT: ; kill: killed $vgpr55 +; GCN-NEXT: ; kill: killed $vgpr59 +; GCN-NEXT: ; kill: killed $vgpr48 +; GCN-NEXT: ; kill: killed $vgpr52 +; GCN-NEXT: ; kill: killed $vgpr56 ; GCN-NEXT: buffer_load_dword v46, v61, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v47, v62, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v48, v63, s[0:3], 0 offen @@ -276,9 +321,27 @@ ; GCN-NEXT: buffer_load_dword v58, v73, s[0:3], 0 offen 
; GCN-NEXT: buffer_load_dword v59, v74, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v60, v75, s[0:3], 0 offen +; GCN-NEXT: ; kill: killed $vgpr64 +; GCN-NEXT: ; kill: killed $vgpr68 +; GCN-NEXT: ; kill: killed $vgpr72 +; GCN-NEXT: ; kill: killed $vgpr61 +; GCN-NEXT: ; kill: killed $vgpr65 +; GCN-NEXT: ; kill: killed $vgpr69 +; GCN-NEXT: ; kill: killed $vgpr73 +; GCN-NEXT: ; kill: killed $vgpr62 +; GCN-NEXT: ; kill: killed $vgpr66 +; GCN-NEXT: ; kill: killed $vgpr70 +; GCN-NEXT: ; kill: killed $vgpr74 +; GCN-NEXT: ; kill: killed $vgpr63 +; GCN-NEXT: ; kill: killed $vgpr67 +; GCN-NEXT: ; kill: killed $vgpr71 +; GCN-NEXT: ; kill: killed $vgpr75 ; GCN-NEXT: buffer_load_dword v61, v76, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v62, v77, s[0:3], 0 offen ; GCN-NEXT: buffer_load_dword v63, v78, s[0:3], 0 offen +; GCN-NEXT: ; kill: killed $vgpr76 +; GCN-NEXT: ; kill: killed $vgpr77 +; GCN-NEXT: ; kill: killed $vgpr78 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:256 ; GCN-NEXT: v_mov_b32_e32 v64, 0 ; GCN-NEXT: s_waitcnt vmcnt(0) Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.i16.ll @@ -2133,8 +2133,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_s(<16 x i16> addrspace(1)* %ptr, i16 inreg %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_v_v16i16_s_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[2:5], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[6:9], v[0:1], off offset:16 ; GFX9-NEXT: s_and_b32 s1, s3, 1 ; GFX9-NEXT: s_lshr_b32 s12, s3, 1 ; GFX9-NEXT: s_mov_b32 s0, 0xffff @@ -2152,26 +2152,26 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; 
GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v11, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v2, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[10:11] -; GFX9-NEXT: v_and_or_b32 v12, v1, s13, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[10:11] +; GFX9-NEXT: v_and_or_b32 v10, v1, s13, v0 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v2, v10, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v4, v10, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v10, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v5, v10, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v6, v10, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v7, v10, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v8, v10, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v9, v10, s[10:11] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off @@ -2843,8 +2843,8 @@ define amdgpu_ps void @insertelement_v_v16i16_s_v(<16 x i16> addrspace(1)* 
%ptr, i16 inreg %val, i32 %idx) { ; GFX9-LABEL: insertelement_v_v16i16_s_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v2 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v2 ; GFX9-NEXT: s_mov_b32 s0, 0xffff @@ -2862,25 +2862,25 @@ ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v11, v3, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v6, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[10:11] -; GFX9-NEXT: v_and_or_b32 v12, v3, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v8, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v9, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v11, v11, v10, s[10:11] +; GFX9-NEXT: v_and_or_b32 v11, v11, v1, v2 +; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc ; GFX9-NEXT: 
v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off @@ -2992,8 +2992,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_s(<16 x i16> addrspace(1)* %ptr, i16 %val, i32 inreg %idx) { ; GFX9-LABEL: insertelement_v_v16i16_v_s: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[3:6], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[7:10], v[0:1], off offset:16 ; GFX9-NEXT: s_and_b32 s1, s2, 1 ; GFX9-NEXT: s_lshr_b32 s12, s2, 1 ; GFX9-NEXT: s_lshl_b32 s1, s1, 4 @@ -3009,26 +3009,26 @@ ; GFX9-NEXT: v_cmp_eq_u32_e64 s[8:9], s12, 6 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[10:11], s12, 7 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v1, v8, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v11, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v3, v4, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v4, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v5, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v6, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[10:11] -; GFX9-NEXT: v_and_or_b32 v12, v1, s13, v0 +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v7, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v8, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v9, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v1, v1, v10, s[10:11] +; GFX9-NEXT: v_and_or_b32 v11, v1, s13, v0 ; GFX9-NEXT: 
v_cmp_eq_u32_e64 s[12:13], s12, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v2, v5, v11, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v0, v3, v11, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v6, v11, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v8, v11, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v9, v11, s[8:9] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v4, v11, vcc ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v4, v7, v11, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v10, v11, s[10:11] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off @@ -3140,8 +3140,8 @@ define amdgpu_ps void @insertelement_v_v16i16_v_v(<16 x i16> addrspace(1)* %ptr, i16 %val, i32 %idx) { ; GFX9-LABEL: insertelement_v_v16i16_v_v: ; GFX9: ; %bb.0: -; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off -; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 +; GFX9-NEXT: global_load_dwordx4 v[4:7], v[0:1], off +; GFX9-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:16 ; GFX9-NEXT: v_lshrrev_b32_e32 v0, 1, v3 ; GFX9-NEXT: v_and_b32_e32 v1, 1, v3 ; GFX9-NEXT: v_lshlrev_b32_e32 v1, 4, v1 @@ -3158,25 +3158,25 @@ ; GFX9-NEXT: v_xor_b32_e32 v1, -1, v1 ; GFX9-NEXT: v_cmp_eq_u32_e64 s[12:13], 0, v0 ; GFX9-NEXT: s_waitcnt vmcnt(1) -; GFX9-NEXT: v_cndmask_b32_e32 v3, v8, v9, vcc -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v3, v4, v5, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[0:1] +; 
GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[2:3] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v4, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v5, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v6, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v7, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v8, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v9, s[6:7] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v10, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v3, v11, s[10:11] ; GFX9-NEXT: v_and_or_b32 v12, v3, v1, v2 -; GFX9-NEXT: v_cndmask_b32_e64 v0, v8, v12, s[12:13] -; GFX9-NEXT: v_cndmask_b32_e32 v1, v9, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v0, v4, v12, s[12:13] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v5, v12, vcc +; GFX9-NEXT: v_cndmask_b32_e64 v4, v8, v12, s[4:5] +; GFX9-NEXT: v_cndmask_b32_e64 v5, v9, v12, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v8, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v2, v10, v12, s[0:1] -; GFX9-NEXT: v_cndmask_b32_e64 v3, v11, v12, s[2:3] +; GFX9-NEXT: v_cndmask_b32_e64 v2, v6, v12, s[0:1] +; GFX9-NEXT: v_cndmask_b32_e64 v3, v7, v12, s[2:3] ; GFX9-NEXT: v_mov_b32_e32 v9, 0 -; GFX9-NEXT: v_cndmask_b32_e64 v4, v4, v12, s[4:5] -; GFX9-NEXT: v_cndmask_b32_e64 v5, v5, v12, s[6:7] -; GFX9-NEXT: v_cndmask_b32_e64 v6, v6, v12, s[8:9] -; GFX9-NEXT: v_cndmask_b32_e64 v7, v7, v12, s[10:11] +; GFX9-NEXT: v_cndmask_b32_e64 v6, v10, v12, s[8:9] +; GFX9-NEXT: v_cndmask_b32_e64 v7, v11, v12, s[10:11] ; GFX9-NEXT: s_mov_b64 s[0:1], 16 ; GFX9-NEXT: v_mov_b32_e32 v10, 0 ; GFX9-NEXT: global_store_dwordx4 v[8:9], v[0:3], off Index: llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/insertelement.large.ll @@ -5,62 +5,61 @@ ; GCN-LABEL: v_insert_v64i32_37: ; GCN: ; %bb.0: ; GCN-NEXT: s_load_dwordx4 s[0:3], s[4:5], 0x0 -; GCN-NEXT: v_lshlrev_b32_e32 v64, 8, v0 +; GCN-NEXT: 
v_lshlrev_b32_e32 v68, 8, v0 ; GCN-NEXT: s_movk_i32 s4, 0x80 ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: v_mov_b32_e32 v0, s0 ; GCN-NEXT: v_mov_b32_e32 v1, s1 -; GCN-NEXT: v_add_co_u32_e32 v4, vcc, v0, v64 -; GCN-NEXT: v_addc_co_u32_e32 v5, vcc, 0, v1, vcc -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 64, v4 -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v5, vcc +; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v0, v68 +; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v1, vcc +; GCN-NEXT: v_add_co_u32_e32 v0, vcc, 64, v2 +; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v3, vcc ; GCN-NEXT: global_load_dwordx4 v[32:35], v[0:1], off offset:16 ; GCN-NEXT: global_load_dwordx4 v[36:39], v[0:1], off offset:32 ; GCN-NEXT: global_load_dwordx4 v[40:43], v[0:1], off offset:48 ; GCN-NEXT: v_mov_b32_e32 v0, s4 ; GCN-NEXT: v_mov_b32_e32 v1, s5 -; GCN-NEXT: v_add_co_u32_e32 v0, vcc, v4, v0 +; GCN-NEXT: v_add_co_u32_e32 v64, vcc, v2, v0 ; GCN-NEXT: s_movk_i32 s4, 0xc0 -; GCN-NEXT: v_mov_b32_e32 v2, s4 -; GCN-NEXT: v_addc_co_u32_e32 v1, vcc, v5, v1, vcc -; GCN-NEXT: v_mov_b32_e32 v3, s5 -; GCN-NEXT: v_add_co_u32_e32 v2, vcc, v4, v2 -; GCN-NEXT: v_addc_co_u32_e32 v3, vcc, v5, v3, vcc -; GCN-NEXT: global_load_dwordx4 v[44:47], v64, s[0:1] -; GCN-NEXT: global_load_dwordx4 v[48:51], v64, s[0:1] offset:16 -; GCN-NEXT: global_load_dwordx4 v[52:55], v64, s[0:1] offset:32 -; GCN-NEXT: global_load_dwordx4 v[56:59], v64, s[0:1] offset:48 -; GCN-NEXT: global_load_dwordx4 v[60:63], v64, s[0:1] offset:64 -; GCN-NEXT: global_load_dwordx4 v[4:7], v[0:1], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[8:11], v[0:1], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[12:15], v[0:1], off offset:48 -; GCN-NEXT: global_load_dwordx4 v[20:23], v[2:3], off offset:16 -; GCN-NEXT: global_load_dwordx4 v[24:27], v[2:3], off offset:32 -; GCN-NEXT: global_load_dwordx4 v[28:31], v[2:3], off offset:48 -; GCN-NEXT: s_nop 0 -; GCN-NEXT: global_load_dwordx4 v[0:3], v64, s[0:1] offset:128 -; GCN-NEXT: global_load_dwordx4 
v[16:19], v64, s[0:1] offset:192 +; GCN-NEXT: v_addc_co_u32_e32 v65, vcc, v3, v1, vcc +; GCN-NEXT: v_mov_b32_e32 v0, s4 +; GCN-NEXT: v_mov_b32_e32 v1, s5 +; GCN-NEXT: v_add_co_u32_e32 v66, vcc, v2, v0 +; GCN-NEXT: v_addc_co_u32_e32 v67, vcc, v3, v1, vcc +; GCN-NEXT: global_load_dwordx4 v[44:47], v68, s[0:1] +; GCN-NEXT: global_load_dwordx4 v[48:51], v68, s[0:1] offset:16 +; GCN-NEXT: global_load_dwordx4 v[52:55], v68, s[0:1] offset:32 +; GCN-NEXT: global_load_dwordx4 v[56:59], v68, s[0:1] offset:48 +; GCN-NEXT: global_load_dwordx4 v[60:63], v68, s[0:1] offset:64 +; GCN-NEXT: global_load_dwordx4 v[4:7], v[64:65], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[8:11], v[64:65], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[12:15], v[64:65], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[20:23], v[66:67], off offset:16 +; GCN-NEXT: global_load_dwordx4 v[24:27], v[66:67], off offset:32 +; GCN-NEXT: global_load_dwordx4 v[28:31], v[66:67], off offset:48 +; GCN-NEXT: global_load_dwordx4 v[0:3], v68, s[0:1] offset:128 +; GCN-NEXT: global_load_dwordx4 v[16:19], v68, s[0:1] offset:192 ; GCN-NEXT: s_waitcnt vmcnt(7) ; GCN-NEXT: v_mov_b32_e32 v5, 0x3e7 ; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: global_store_dwordx4 v64, v[0:3], s[2:3] offset:128 -; GCN-NEXT: global_store_dwordx4 v64, v[4:7], s[2:3] offset:144 -; GCN-NEXT: global_store_dwordx4 v64, v[8:11], s[2:3] offset:160 -; GCN-NEXT: global_store_dwordx4 v64, v[12:15], s[2:3] offset:176 +; GCN-NEXT: global_store_dwordx4 v68, v[0:3], s[2:3] offset:128 +; GCN-NEXT: global_store_dwordx4 v68, v[4:7], s[2:3] offset:144 +; GCN-NEXT: global_store_dwordx4 v68, v[8:11], s[2:3] offset:160 +; GCN-NEXT: global_store_dwordx4 v68, v[12:15], s[2:3] offset:176 ; GCN-NEXT: s_waitcnt vmcnt(4) -; GCN-NEXT: global_store_dwordx4 v64, v[16:19], s[2:3] offset:192 -; GCN-NEXT: global_store_dwordx4 v64, v[20:23], s[2:3] offset:208 -; GCN-NEXT: global_store_dwordx4 v64, v[24:27], s[2:3] offset:224 -; GCN-NEXT: global_store_dwordx4 v64, 
v[44:47], s[2:3] -; GCN-NEXT: global_store_dwordx4 v64, v[48:51], s[2:3] offset:16 -; GCN-NEXT: global_store_dwordx4 v64, v[52:55], s[2:3] offset:32 -; GCN-NEXT: global_store_dwordx4 v64, v[56:59], s[2:3] offset:48 -; GCN-NEXT: global_store_dwordx4 v64, v[60:63], s[2:3] offset:64 -; GCN-NEXT: global_store_dwordx4 v64, v[28:31], s[2:3] offset:240 -; GCN-NEXT: global_store_dwordx4 v64, v[32:35], s[2:3] offset:80 -; GCN-NEXT: global_store_dwordx4 v64, v[36:39], s[2:3] offset:96 -; GCN-NEXT: global_store_dwordx4 v64, v[40:43], s[2:3] offset:112 +; GCN-NEXT: global_store_dwordx4 v68, v[16:19], s[2:3] offset:192 +; GCN-NEXT: global_store_dwordx4 v68, v[20:23], s[2:3] offset:208 +; GCN-NEXT: global_store_dwordx4 v68, v[24:27], s[2:3] offset:224 +; GCN-NEXT: global_store_dwordx4 v68, v[44:47], s[2:3] +; GCN-NEXT: global_store_dwordx4 v68, v[48:51], s[2:3] offset:16 +; GCN-NEXT: global_store_dwordx4 v68, v[52:55], s[2:3] offset:32 +; GCN-NEXT: global_store_dwordx4 v68, v[56:59], s[2:3] offset:48 +; GCN-NEXT: global_store_dwordx4 v68, v[60:63], s[2:3] offset:64 +; GCN-NEXT: global_store_dwordx4 v68, v[28:31], s[2:3] offset:240 +; GCN-NEXT: global_store_dwordx4 v68, v[32:35], s[2:3] offset:80 +; GCN-NEXT: global_store_dwordx4 v68, v[36:39], s[2:3] offset:96 +; GCN-NEXT: global_store_dwordx4 v68, v[40:43], s[2:3] offset:112 ; GCN-NEXT: s_endpgm %id = call i32 @llvm.amdgcn.workitem.id.x() %gep.in = getelementptr <64 x i32>, <64 x i32> addrspace(1)* %ptr.in, i32 %id Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.div.fmas.ll @@ -252,17 +252,17 @@ ; GFX10_W32-LABEL: test_div_fmas_f32: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x4 -; GFX10_W32-NEXT: s_load_dword s7, s[0:1], 0xb8 +; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0xb8 ; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 
; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x94 -; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10_W32-NEXT: s_load_dword s7, s[0:1], 0x4c ; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s0, 1, s7 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s4 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX10_W32-NEXT: v_div_fmas_f32 v0, s7, v0, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W32-NEXT: s_endpgm @@ -270,17 +270,17 @@ ; GFX10_W64-LABEL: test_div_fmas_f32: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x4 -; GFX10_W64-NEXT: s_load_dword s7, s[0:1], 0xb8 +; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0xb8 ; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 ; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x94 -; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10_W64-NEXT: s_load_dword s7, s[0:1], 0x4c ; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s0, 1, s7 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s4 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 -; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX10_W64-NEXT: v_div_fmas_f32 v0, s7, v0, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm @@ -329,32 +329,32 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_0: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0xb8 +; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0xb8 ; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x94 -; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x70 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x70 ; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; 
GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s4 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10_W32-NEXT: v_div_fmas_f32 v0, 1.0, s4, v0 +; GFX10_W32-NEXT: v_div_fmas_f32 v0, 1.0, s6, v0 ; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_0: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0xb8 +; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0xb8 ; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x94 -; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x70 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x70 ; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s4 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX10_W64-NEXT: v_div_fmas_f32 v0, 1.0, s4, v0 +; GFX10_W64-NEXT: v_div_fmas_f32 v0, 1.0, s6, v0 ; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float 1.0, float %b, float %c, i1 %d) @@ -402,32 +402,32 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_1: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x58 +; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x58 ; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x34 -; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s4 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; 
GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, 1.0, v0 +; GFX10_W32-NEXT: v_div_fmas_f32 v0, s6, 1.0, v0 ; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_1: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x58 +; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x58 ; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x34 -; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x2c +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x2c ; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s4 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, 1.0, v0 +; GFX10_W64-NEXT: v_div_fmas_f32 v0, s6, 1.0, v0 ; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float 1.0, float %c, i1 %d) @@ -475,32 +475,32 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_inline_imm_2: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0xb8 +; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0xb8 ; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x4c ; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W32-NEXT: s_and_b32 s0, 1, s4 ; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 ; GFX10_W32-NEXT: v_cmp_ne_u32_e64 vcc_lo, 0, s0 -; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, 1.0 +; GFX10_W32-NEXT: v_div_fmas_f32 v0, s6, v0, 1.0 ; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W32-NEXT: s_endpgm ; ; GFX10_W64-LABEL: test_div_fmas_f32_inline_imm_2: ; GFX10_W64: ; 
%bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0xb8 +; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0xb8 ; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x4c ; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: s_and_b32 s0, 1, s6 +; GFX10_W64-NEXT: s_and_b32 s0, 1, s4 ; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 ; GFX10_W64-NEXT: v_cmp_ne_u32_e64 vcc, 0, s0 -; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, 1.0 +; GFX10_W64-NEXT: v_div_fmas_f32 v0, s6, v0, 1.0 ; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm %result = call float @llvm.amdgcn.div.fmas.f32(float %a, float %b, float 1.0, i1 %d) @@ -703,15 +703,15 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x94 -; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x70 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x94 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x4c ; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: s_mov_b32 vcc_lo, 0 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 -; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s4 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s5 +; GFX10_W32-NEXT: v_div_fmas_f32 v0, s6, v0, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W32-NEXT: s_endpgm @@ -719,15 +719,15 @@ ; GFX10_W64-LABEL: test_div_fmas_f32_imm_false_cond_to_vcc: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dword s6, 
s[0:1], 0x94 -; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x70 +; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x94 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x4c ; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: s_mov_b64 vcc, 0 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 -; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s5 +; GFX10_W64-NEXT: v_div_fmas_f32 v0, s6, v0, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm @@ -774,15 +774,15 @@ ; GFX10_W32-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: ; GFX10_W32: ; %bb.0: ; GFX10_W32-NEXT: s_clause 0x3 -; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x70 -; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x94 -; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10_W32-NEXT: s_load_dword s4, s[0:1], 0x70 +; GFX10_W32-NEXT: s_load_dword s5, s[0:1], 0x94 +; GFX10_W32-NEXT: s_load_dword s6, s[0:1], 0x4c ; GFX10_W32-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W32-NEXT: s_mov_b32 vcc_lo, -1 ; GFX10_W32-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W32-NEXT: v_mov_b32_e32 v0, s5 -; GFX10_W32-NEXT: v_mov_b32_e32 v1, s6 -; GFX10_W32-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX10_W32-NEXT: v_mov_b32_e32 v0, s4 +; GFX10_W32-NEXT: v_mov_b32_e32 v1, s5 +; GFX10_W32-NEXT: v_div_fmas_f32 v0, s6, v0, v1 ; GFX10_W32-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W32-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W32-NEXT: s_endpgm @@ -790,15 +790,15 @@ ; GFX10_W64-LABEL: test_div_fmas_f32_imm_true_cond_to_vcc: ; GFX10_W64: ; %bb.0: ; GFX10_W64-NEXT: s_clause 0x3 -; GFX10_W64-NEXT: s_load_dword s5, s[0:1], 0x70 -; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x94 -; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x4c +; GFX10_W64-NEXT: s_load_dword s4, s[0:1], 0x70 +; GFX10_W64-NEXT: 
s_load_dword s5, s[0:1], 0x94 +; GFX10_W64-NEXT: s_load_dword s6, s[0:1], 0x4c ; GFX10_W64-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 ; GFX10_W64-NEXT: s_mov_b64 vcc, -1 ; GFX10_W64-NEXT: s_waitcnt lgkmcnt(0) -; GFX10_W64-NEXT: v_mov_b32_e32 v0, s5 -; GFX10_W64-NEXT: v_mov_b32_e32 v1, s6 -; GFX10_W64-NEXT: v_div_fmas_f32 v0, s4, v0, v1 +; GFX10_W64-NEXT: v_mov_b32_e32 v0, s4 +; GFX10_W64-NEXT: v_mov_b32_e32 v1, s5 +; GFX10_W64-NEXT: v_div_fmas_f32 v0, s6, v0, v1 ; GFX10_W64-NEXT: v_mov_b32_e32 v1, 0 ; GFX10_W64-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10_W64-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.amdgcn.update.dpp.ll @@ -20,14 +20,14 @@ ; GFX10-LABEL: dpp_test: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_clause 0x1 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x2c -; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x24 +; GFX10-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x2c +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x24 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-NEXT: v_mov_b32_e32 v0, s4 -; GFX10-NEXT: v_mov_b32_e32 v1, s5 +; GFX10-NEXT: v_mov_b32_e32 v0, s2 +; GFX10-NEXT: v_mov_b32_e32 v1, s3 ; GFX10-NEXT: v_mov_b32_dpp v0, v1 quad_perm:[1,0,0,0] row_mask:0x1 bank_mask:0x1 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: global_store_dword v1, v0, s[2:3] +; GFX10-NEXT: global_store_dword v1, v0, s[4:5] ; GFX10-NEXT: s_endpgm %tmp0 = call i32 @llvm.amdgcn.update.dpp.i32(i32 %in1, i32 %in2, i32 1, i32 1, i32 1, i1 false) store i32 %tmp0, i32 addrspace(1)* %out Index: llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/load-constant.96.ll @@ -18,52 +18,52 @@ ; GFX9-NOUNALIGNED-LABEL: 
v_load_constant_v3i32_align1: ; GFX9-NOUNALIGNED: ; %bb.0: ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v13, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[0:1], off offset:1 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[0:1], off offset:2 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[0:1], off offset:3 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[0:1], off offset:4 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[0:1], off offset:5 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[0:1], off offset:6 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[0:1], off offset:7 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v[0:1], off offset:8 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[0:1], off offset:9 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v[0:1], off offset:10 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[0:1], off offset:11 -; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0xff +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v[0:1], off offset:1 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v[0:1], off offset:2 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v[0:1], off offset:3 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v[0:1], off offset:4 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v[0:1], off offset:5 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v[0:1], off offset:6 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v[0:1], off offset:7 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v[0:1], off offset:8 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v[0:1], off offset:9 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v[0:1], off offset:10 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v13, v[0:1], off offset:11 ; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s4, 0xff +; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0xff ; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s5, 8 ; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 
v1, 8 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v12, s5, v12 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v3, s5, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v11, s4, v11 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s4, v4 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v10, s4, v10 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v11, 16, v11 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, s4, v5 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, s4, v3 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(6) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v8, s5, v8 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v7, s5, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 24, v10 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v9, v9, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v4 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, v1, v4 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v1, v1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, v3, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v12, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v2, v0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v4, v13, s4, v12 -; 
GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v5, v5, v0, v1 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v8, v9, s4, v8 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 16, v7 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 24, v6 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 24, v2 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v4, v11, v10 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v8, v7, v6 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v5, v3, v2 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v12, v13, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v5 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v5, v6, s4, v7 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v8 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v8, v10, v0, v1 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v7, 24, v9 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 16, v11 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 24, v12 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v2, v3, v4 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v5, v6, v7 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v8, v9, v10 ; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align1: @@ -154,25 +154,25 @@ ; GFX9-NOUNALIGNED-LABEL: v_load_constant_v3i32_align2: ; GFX9-NOUNALIGNED: ; %bb.0: ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v7, v[0:1], off -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[0:1], off offset:2 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v[0:1], off offset:4 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[0:1], off offset:6 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v[0:1], off offset:8 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[0:1], off offset:10 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v[0:1], off +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v[0:1], off offset:2 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v[0:1], off offset:4 +; GFX9-NOUNALIGNED-NEXT: 
global_load_ushort v5, v[0:1], off offset:6 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v[0:1], off offset:8 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v7, v[0:1], off offset:10 ; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s4, 0xffff ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v6 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s4, v3 ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v4 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s4, v5 ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v1, 16, v1 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s4, v2 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v7, s4, v0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v5, s4, v1 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v3, s4, v2 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s4, v7 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v2, s4, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v4, s4, v1 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v6, s4, v3 ; GFX9-NOUNALIGNED-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-UNALIGNED-LABEL: v_load_constant_v3i32_align2: @@ -393,52 +393,52 @@ ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align1: ; GFX9-NOUNALIGNED: ; %bb.0: ; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v0, s[0:1] -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v0, s[0:1] offset:1 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v0, s[0:1] offset:2 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v0, s[0:1] offset:3 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v0, s[0:1] offset:4 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v0, s[0:1] offset:5 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v0, s[0:1] offset:6 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v0, s[0:1] 
offset:7 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v0, s[0:1] offset:8 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v0, s[0:1] offset:9 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v0, s[0:1] offset:10 -; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v0, s[0:1] offset:11 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v1, v0, s[0:1] +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v2, v0, s[0:1] offset:1 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v3, v0, s[0:1] offset:2 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v4, v0, s[0:1] offset:3 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v5, v0, s[0:1] offset:4 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v6, v0, s[0:1] offset:5 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v7, v0, s[0:1] offset:6 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v8, v0, s[0:1] offset:7 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v9, v0, s[0:1] offset:8 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v10, v0, s[0:1] offset:9 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v11, v0, s[0:1] offset:10 +; GFX9-NOUNALIGNED-NEXT: global_load_ubyte v12, v0, s[0:1] offset:11 ; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0xff ; GFX9-NOUNALIGNED-NEXT: s_movk_i32 s0, 0xff ; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s1, 8 ; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v13, 8 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(10) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v11, s1, v11 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v2, s1, v2 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(9) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v10, s0, v10 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(8) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v9, s0, v9 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v11, v12, s0, v11 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v4 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v1, s0, v2 ; GFX9-NOUNALIGNED-NEXT: 
s_waitcnt vmcnt(6) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v7, s1, v7 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v6, s1, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(5) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v6, v6, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v7, v7, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v5, v5, v0 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v10, 16, v10 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v8, v8, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v3 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v3, v13, v3 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_sdwa v10, v13, v10 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:BYTE_0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(1) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, v2, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v11, v11, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, v1, v0 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v3, v4, v0, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 24, v9 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v7, v8, s0, v7 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 16, v6 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 24, v5 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 24, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v11, v10, v9 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v7, v6, v5 -; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v3, v2, v4 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v12, v12, v0 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 24, v4 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v4, v5, s0, v6 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 
v7, v9, v0, v10 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v6, 24, v8 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v8, 16, v11 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v9, 24, v12 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v0, v1, v2, v3 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v1, v4, v5, v6 +; GFX9-NOUNALIGNED-NEXT: v_or3_b32 v2, v7, v8, v9 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 @@ -538,25 +538,25 @@ ; GFX9-NOUNALIGNED-LABEL: s_load_constant_v3i32_align2: ; GFX9-NOUNALIGNED: ; %bb.0: ; GFX9-NOUNALIGNED-NEXT: v_mov_b32_e32 v0, 0 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v0, s[0:1] -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v0, s[0:1] offset:2 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v0, s[0:1] offset:4 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v0, s[0:1] offset:6 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v0, s[0:1] offset:8 -; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v0, s[0:1] offset:10 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v1, v0, s[0:1] +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v2, v0, s[0:1] offset:2 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v3, v0, s[0:1] offset:4 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v4, v0, s[0:1] offset:6 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v5, v0, s[0:1] offset:8 +; GFX9-NOUNALIGNED-NEXT: global_load_ushort v6, v0, s[0:1] offset:10 ; GFX9-NOUNALIGNED-NEXT: s_mov_b32 s0, 0xffff ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(4) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v5 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v0, s0, v2 ; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v0, 16, v0 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt vmcnt(2) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v3, s0, v3 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v2, s0, v4 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v2, 16, v2 ; GFX9-NOUNALIGNED-NEXT: s_waitcnt 
vmcnt(0) -; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v1, s0, v1 -; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v5, 16, v1 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v4, s0, v3 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v2, s0, v5 -; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v6, s0, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_b32_e32 v4, s0, v6 +; GFX9-NOUNALIGNED-NEXT: v_lshlrev_b32_e32 v4, 16, v4 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v0, v1, s0, v0 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v1, v3, s0, v2 +; GFX9-NOUNALIGNED-NEXT: v_and_or_b32 v2, v5, s0, v4 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s0, v0 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s1, v1 ; GFX9-NOUNALIGNED-NEXT: v_readfirstlane_b32 s2, v2 Index: llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -109,15 +109,15 @@ ; GFX9-NEXT: s_getpc_b64 s[2:3] ; GFX9-NEXT: s_add_u32 s2, s2, gv3@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s3, s3, gv3@gotpcrel32@hi+12 -; GFX9-NEXT: s_load_dwordx2 s[6:7], s[0:1], 0x0 -; GFX9-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 +; GFX9-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 1 ; GFX9-NEXT: s_mov_b32 s0, 0 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_store_dword v0, v0, s[6:7] +; GFX9-NEXT: global_store_dword v0, v0, s[4:5] ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_store_dword v0, v1, s[4:5] +; GFX9-NEXT: global_store_dword v0, v1, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: BB1_2: ; %Flow ; GFX9-NEXT: s_xor_b32 s0, s0, -1 Index: llvm/test/CodeGen/AMDGPU/disable_form_clauses.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/disable_form_clauses.ll +++ llvm/test/CodeGen/AMDGPU/disable_form_clauses.ll @@ -1,10 +1,9 @@ ; RUN: llc 
-march=amdgcn -mcpu=gfx902 -verify-machineinstrs -stop-after=si-form-memory-clauses < %s | FileCheck -check-prefix=GCN %s ; GCN-LABEL: {{^}}name:{{[ ]*}}vector_clause -; GCN: BUNDLE +; GCN: LOAD_DWORDX2 ; GCN-NEXT: LOAD_DWORDX2 -; GCN-NEXT: LOAD_DWORDX2 -; GCN-NEXT: {{^ *[}]}} +; GCN-NEXT: KILL define amdgpu_kernel void @vector_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -33,6 +32,7 @@ ; GCN-LABEL: {{^}}name:{{[ ]*}}no_vector_clause ; GCN-NOT: BUNDLE +; GCN-NOT: KILL define amdgpu_kernel void @no_vector_clause(<4 x i32> addrspace(1)* noalias nocapture readonly %arg, <4 x i32> addrspace(1)* noalias nocapture %arg1) #0 { bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() Index: llvm/test/CodeGen/AMDGPU/ds_write2.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/ds_write2.ll +++ llvm/test/CodeGen/AMDGPU/ds_write2.ll @@ -200,14 +200,16 @@ ; GFX9-LABEL: simple_write2_two_val_subreg2_mixed_f32: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x2c -; GFX9-NEXT: v_lshlrev_b32_e32 v5, 3, v0 +; GFX9-NEXT: v_lshlrev_b32_e32 v4, 3, v0 ; GFX9-NEXT: v_lshlrev_b32_e32 v0, 2, v0 +; GFX9-NEXT: ; kill: killed $vgpr4 +; GFX9-NEXT: ; kill: killed $sgpr0_sgpr1 ; GFX9-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[1:2], v5, s[0:1] glc +; GFX9-NEXT: global_load_dwordx2 v[1:2], v4, s[0:1] glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: global_load_dwordx2 v[3:4], v5, s[0:1] offset:8 glc +; GFX9-NEXT: global_load_dwordx2 v[2:3], v4, s[0:1] offset:8 glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: ds_write2_b32 v0, v1, v4 offset1:8 +; GFX9-NEXT: ds_write2_b32 v0, v1, v3 offset1:8 ; GFX9-NEXT: s_endpgm %x.i = tail call i32 @llvm.amdgcn.workitem.id.x() #1 %in.gep.0 = getelementptr <2 x float>, <2 x float> addrspace(1)* %in, i32 %x.i Index: 
llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -2709,8 +2709,9 @@ ; GFX9-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64 ; GFX9-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80 ; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 -; GFX9-NEXT: s_add_u32 s32, s32, 0x400 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32@rel32@hi+12 @@ -2792,14 +2793,15 @@ ; GFX9-NEXT: global_load_dwordx4 v[16:19], v28, s[4:5] offset:64 ; GFX9-NEXT: global_load_dwordx4 v[20:23], v28, s[4:5] offset:80 ; GFX9-NEXT: global_load_dwordx4 v[24:27], v28, s[4:5] offset:96 -; GFX9-NEXT: s_add_u32 s32, s32, 0x400 -; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_nop 0 ; GFX9-NEXT: global_load_dwordx4 v[28:31], v28, s[4:5] offset:112 -; GFX9-NEXT: s_waitcnt vmcnt(7) -; GFX9-NEXT: global_load_dword v32, v[0:1], off +; GFX9-NEXT: s_add_u32 s32, s32, 0x400 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_v32i32_i32@rel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, external_void_func_v32i32_i32@rel32@hi+12 +; GFX9-NEXT: v_writelane_b32 v40, s31, 1 +; GFX9-NEXT: s_waitcnt vmcnt(7) +; GFX9-NEXT: global_load_dword v32, v[0:1], off ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] Index: llvm/test/CodeGen/AMDGPU/idot2.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot2.ll +++ llvm/test/CodeGen/AMDGPU/idot2.ll @@ -89,12 +89,12 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s1, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s8, v1, v2 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -105,12 +105,12 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s8, s1, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -358,12 +358,12 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-DL-NEXT: v_dot2_i32_i16 v1, s1, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_dot2_i32_i16 v1, s8, v1, v2 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -374,12 +374,12 @@ ; 
GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: v_dot2_i32_i16 v0, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_dot2_i32_i16 v0, s8, s1, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -505,16 +505,16 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 -; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 16 +; GFX10-DL-NEXT: s_lshr_b32 s4, s8, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s0, v0 +; GFX10-DL-NEXT: s_sext_i32_i16 s0, s1 +; GFX10-DL-NEXT: s_sext_i32_i16 s1, s8 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm @@ -623,12 +623,12 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 
-; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s1, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s8, v1, v2 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -639,12 +639,12 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s8, s1, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -770,16 +770,16 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: s_ashr_i32 s4, s0, 16 -; GFX10-DL-NEXT: s_ashr_i32 s5, s1, 16 -; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 -; GFX10-DL-NEXT: s_and_b32 s1, s1, 0xffff -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: s_ashr_i32 
s0, s1, 16 +; GFX10-DL-NEXT: s_ashr_i32 s4, s8, 16 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s0, v0 +; GFX10-DL-NEXT: s_sext_i32_i16 s0, s1 +; GFX10-DL-NEXT: s_and_b32 s1, s8, 0xffff ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm @@ -853,15 +853,15 @@ ; GFX9-NODL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-NODL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-NODL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX9-NODL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-NODL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-NODL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-NODL-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX9-NODL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-NODL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-NODL-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s1, s1, v1 +; GFX9-NODL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-NODL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NODL-NEXT: s_and_b32 s4, s8, 0xffff ; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s0, s0, v1 +; GFX9-NODL-NEXT: v_mad_u32_u24 v1, s4, s4, v1 ; GFX9-NODL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-NODL-NEXT: s_endpgm ; @@ -871,15 +871,15 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s8 -; GFX9-DL-NEXT: s_and_b32 s0, s0, 0xffff -; GFX9-DL-NEXT: v_mad_u32_u24 v1, s1, s1, v1 +; GFX9-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-DL-NEXT: s_and_b32 s4, s8, 
0xffff ; GFX9-DL-NEXT: v_mad_u32_u24 v1, s0, s0, v1 +; GFX9-DL-NEXT: v_mad_u32_u24 v1, s4, s4, v1 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -890,13 +890,13 @@ ; GFX10-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s1, s8 -; GFX10-DL-NEXT: s_and_b32 s0, s0, 0xffff +; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s0, s0, s1 +; GFX10-DL-NEXT: s_and_b32 s0, s8, 0xffff ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s0, s0, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm @@ -1005,12 +1005,12 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s1, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s8, v1, v2 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -1021,12 +1021,12 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword 
s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s8, s1, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1134,12 +1134,12 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x4 -; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[6:7], 0x4 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s1, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_dot2_u32_u16 v1, s8, v1, v2 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -1150,12 +1150,12 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x4 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x4 +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x4 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x4 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_dot2_u32_u16 v0, s8, s1, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i16> addrspace(1)* %src2, @@ -1570,17 +1570,17 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: s_lshr_b32 s5, s0, 16 -; GFX10-DL-NEXT: s_and_b32 s6, s1, s4 -; GFX10-DL-NEXT: s_and_b32 s0, s0, s4 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s5, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: s_mov_b32 s0, 0xffff +; GFX10-DL-NEXT: s_lshr_b32 s4, s1, 16 +; GFX10-DL-NEXT: s_and_b32 s5, s8, s0 +; GFX10-DL-NEXT: s_and_b32 s0, s1, s0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 +; GFX10-DL-NEXT: s_lshr_b32 s1, s8, 16 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm @@ -1716,17 +1716,17 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s6, 0xffff +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-DL-NEXT: s_and_b32 s0, s0, s6 -; GFX10-DL-NEXT: s_and_b32 s1, s1, s6 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 16 +; GFX10-DL-NEXT: s_lshr_b32 s4, s8, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s0, v0 +; GFX10-DL-NEXT: s_and_b32 s0, 
s1, s5 +; GFX10-DL-NEXT: s_and_b32 s1, s8, s5 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v0 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3] @@ -1860,16 +1860,16 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: s_ashr_i32 s4, s0, 16 -; GFX10-DL-NEXT: s_ashr_i32 s5, s1, 16 -; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 -; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: s_ashr_i32 s0, s1, 16 +; GFX10-DL-NEXT: s_ashr_i32 s4, s8, 16 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s0, v0 +; GFX10-DL-NEXT: s_sext_i32_i16 s0, s1 +; GFX10-DL-NEXT: s_sext_i32_i16 s1, s8 ; GFX10-DL-NEXT: v_mad_i32_i24 v1, s1, s0, v0 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3] @@ -2008,19 +2008,19 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-NEXT: s_mov_b32 s4, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: s_and_b32 s5, s0, s4 -; GFX10-DL-NEXT: s_and_b32 s4, s1, s4 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 
+; GFX10-DL-NEXT: s_and_b32 s0, s1, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s8, s4 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 16 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 +; GFX10-DL-NEXT: s_lshr_b32 s5, s8, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s0, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s1, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s0, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2153,18 +2153,18 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: s_sext_i32_i16 s4, s0 -; GFX10-DL-NEXT: s_sext_i32_i16 s5, s1 -; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 16 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: s_sext_i32_i16 s0, s1 +; GFX10-DL-NEXT: s_sext_i32_i16 s4, s8 ; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 16 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 +; GFX10-DL-NEXT: s_ashr_i32 s5, s8, 16 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s0, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s1, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s0, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <2 x i16> addrspace(1)* %src2, @@ -2302,18 +2302,18 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, 
s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX10-DL-NEXT: s_mov_b32 s6, 0xffff +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 +; GFX10-DL-NEXT: s_mov_b32 s5, 0xffff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: s_lshr_b32 s4, s0, 16 -; GFX10-DL-NEXT: s_lshr_b32 s5, s1, 16 -; GFX10-DL-NEXT: s_and_b32 s0, s0, s6 -; GFX10-DL-NEXT: s_and_b32 s1, s1, s6 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 16 +; GFX10-DL-NEXT: s_lshr_b32 s4, s8, 16 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s0, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s0, v0 +; GFX10-DL-NEXT: s_and_b32 s0, s1, s5 +; GFX10-DL-NEXT: s_and_b32 s1, s8, s5 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s1, s0, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm @@ -2447,17 +2447,17 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: s_ashr_i32 s4, s0, 16 -; GFX10-DL-NEXT: s_ashr_i32 s5, s1, 16 -; GFX10-DL-NEXT: s_sext_i32_i16 s0, s0 -; GFX10-DL-NEXT: s_sext_i32_i16 s1, s1 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s4, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: s_ashr_i32 s0, s1, 16 +; GFX10-DL-NEXT: s_ashr_i32 s4, s8, 16 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s0, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s0, v0 
+; GFX10-DL-NEXT: s_sext_i32_i16 s0, s1 +; GFX10-DL-NEXT: s_sext_i32_i16 s1, s8 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s1, s0, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/idot4s.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot4s.ll +++ llvm/test/CodeGen/AMDGPU/idot4s.ll @@ -105,13 +105,13 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-DL-NEXT: v_dot4_i32_i8 v1, s0, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_dot4_i32_i8 v1, s8, v1, v2 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -122,12 +122,12 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: v_dot4_i32_i8 v0, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_dot4_i32_i8 v0, s1, s8, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -631,23 +631,23 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; 
GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: s_sext_i32_i8 s4, s0 -; GFX10-DL-NEXT: s_sext_i32_i8 s5, s1 -; GFX10-DL-NEXT: s_bfe_i32 s6, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_i32 s7, s1, 0x80008 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s6, s7, v0 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 -; GFX10-DL-NEXT: s_bfe_i32 s4, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x80010 -; GFX10-DL-NEXT: s_ashr_i32 s0, s0, 24 -; GFX10-DL-NEXT: s_ashr_i32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_i32_i24 v0, s4, s5, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: s_sext_i32_i8 s0, s1 +; GFX10-DL-NEXT: s_sext_i32_i8 s4, s8 +; GFX10-DL-NEXT: s_bfe_i32 s5, s1, 0x80008 +; GFX10-DL-NEXT: s_bfe_i32 s6, s8, 0x80008 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s0, s4, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s5, s6, v0 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s0, s4, v0 +; GFX10-DL-NEXT: s_bfe_i32 s0, s1, 0x80010 +; GFX10-DL-NEXT: s_bfe_i32 s4, s8, 0x80010 +; GFX10-DL-NEXT: v_mad_i32_i24 v0, s0, s4, v0 +; GFX10-DL-NEXT: s_ashr_i32 s0, s1, 24 +; GFX10-DL-NEXT: s_ashr_i32 s1, s8, 24 ; GFX10-DL-NEXT: v_mad_i32_i24 v0, s0, s1, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/idot4u.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot4u.ll +++ llvm/test/CodeGen/AMDGPU/idot4u.ll @@ -108,13 +108,13 @@ ; GFX9-DL-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x34 ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: 
s_load_dword s1, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s0, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_dot4_u32_u8 v1, s8, v1, v2 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -125,12 +125,12 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: v_dot4_u32_u8 v0, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_dot4_u32_u8 v0, s1, s8, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <4 x i8> addrspace(1)* %src2, @@ -598,16 +598,16 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[0:3], s[0:1], 0x24 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-DL-NEXT: global_load_ubyte v1, v0, s[4:5] -; GFX10-DL-NEXT: s_load_dword s7, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s6, s[0:1], 0x0 +; GFX10-DL-NEXT: s_load_dword s6, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s7, s[0:1], 0x0 ; GFX10-DL-NEXT: s_movk_i32 s0, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_and_b32 s1, s7, s0 -; GFX10-DL-NEXT: s_and_b32 s0, s6, s0 +; GFX10-DL-NEXT: s_and_b32 s1, s6, s0 +; GFX10-DL-NEXT: s_and_b32 s0, s7, s0 ; GFX10-DL-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 -; GFX10-DL-NEXT: s_bfe_u32 s0, s7, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s1, s6, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s0, s6, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s1, s7, 0x80008 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s0, v1 ; GFX10-DL-NEXT: global_store_byte v0, v1, s[4:5] ; GFX10-DL-NEXT: s_endpgm @@ -1113,24 +1113,24 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-NEXT: s_movk_i32 s4, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: s_and_b32 s5, s0, s4 -; GFX10-DL-NEXT: s_and_b32 s4, s1, s4 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x80008 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s6, s7, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s4, v0 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 -; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: s_and_b32 s0, s1, s4 +; GFX10-DL-NEXT: s_and_b32 s4, s8, s4 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s6, s8, 0x80008 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s0, s4, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s5, s6, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s0, s4, v0 +; GFX10-DL-NEXT: s_bfe_u32 s0, s1, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s4, s8, 0x80010 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s0, s4, v0 +; GFX10-DL-NEXT: s_lshr_b32 s0, s1, 24 +; GFX10-DL-NEXT: s_lshr_b32 s1, s8, 24 ; GFX10-DL-NEXT: 
v_mad_u32_u24 v0, s0, s1, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm @@ -1317,25 +1317,25 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-NEXT: s_movk_i32 s6, 0xff ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80008 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80008 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x80008 +; GFX10-DL-NEXT: s_bfe_u32 s5, s8, 0x80008 ; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 -; GFX10-DL-NEXT: s_and_b32 s4, s0, s6 -; GFX10-DL-NEXT: s_and_b32 s5, s1, s6 +; GFX10-DL-NEXT: s_and_b32 s4, s1, s6 +; GFX10-DL-NEXT: s_and_b32 s5, s8, s6 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s4, s5, v0 -; GFX10-DL-NEXT: s_bfe_u32 s4, s0, 0x80010 -; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x80010 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 24 +; GFX10-DL-NEXT: s_bfe_u32 s4, s1, 0x80010 +; GFX10-DL-NEXT: s_bfe_u32 s5, s8, 0x80010 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 24 +; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s0, v0 ; GFX10-DL-NEXT: v_mad_u32_u24 v1, s4, s5, v1 -; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, s8, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: s_lshr_b32 s4, s8, 24 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s4, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v1, v0 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm Index: llvm/test/CodeGen/AMDGPU/idot8s.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot8s.ll +++ llvm/test/CodeGen/AMDGPU/idot8s.ll @@ -179,13 +179,13 @@ ; GFX9-DL-NEXT: 
v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-DL-NEXT: v_dot8_i32_i4 v1, s0, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_dot8_i32_i4 v1, s8, v1, v2 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -202,12 +202,12 @@ ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-XNACK-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-XNACK-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v0, s0, s1, v0 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v0, s1, s8, v0 ; GFX10-DL-XNACK-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-XNACK-NEXT: s_endpgm ; @@ -1491,35 +1491,35 @@ ; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-XNACK-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-XNACK-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s8, 
s[6:7], 0x0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x40000 -; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, s1, 0x40000 -; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v0, s4, s5, v0 -; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v0 -; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x40004 -; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, s1, 0x40004 -; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v1 -; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x40008 -; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, s1, 0x40008 -; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v1 -; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x4000c -; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, s1, 0x4000c -; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v1 -; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x40010 -; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, s1, 0x40010 -; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v1 -; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x40014 -; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, s1, 0x40014 -; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v1 -; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s0, 0x40018 -; GFX10-DL-XNACK-NEXT: s_bfe_i32 s5, s1, 0x40018 -; GFX10-DL-XNACK-NEXT: s_ashr_i32 s0, s0, 28 -; GFX10-DL-XNACK-NEXT: s_ashr_i32 s1, s1, 28 -; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s4, s5, v1 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s1, 0x40000 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s8, 0x40000 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v0, s0, s4, v0 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s4, v0 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s1, 0x40004 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s8, 0x40004 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s4, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s1, 0x40008 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s8, 0x40008 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s4, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s1, 0x4000c +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s8, 0x4000c +; GFX10-DL-XNACK-NEXT: 
v_mad_i32_i24 v1, s0, s4, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s1, 0x40010 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s8, 0x40010 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s4, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s1, 0x40014 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s8, 0x40014 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s4, v1 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s0, s1, 0x40018 +; GFX10-DL-XNACK-NEXT: s_bfe_i32 s4, s8, 0x40018 +; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s4, v1 +; GFX10-DL-XNACK-NEXT: s_ashr_i32 s0, s1, 28 +; GFX10-DL-XNACK-NEXT: s_ashr_i32 s1, s8, 28 ; GFX10-DL-XNACK-NEXT: v_mad_i32_i24 v1, s0, s1, v1 ; GFX10-DL-XNACK-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-DL-XNACK-NEXT: global_store_dword v2, v0, s[2:3] @@ -1859,13 +1859,13 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-DL-NEXT: v_dot8_i32_i4 v1, s0, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_dot8_i32_i4 v1, s8, v1, v2 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -1882,12 +1882,12 @@ ; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-XNACK-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-XNACK-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-XNACK-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-XNACK-NEXT: s_load_dword s8, s[6:7], 0x0 ; 
GFX10-DL-XNACK-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v0, s0, s1, v0 +; GFX10-DL-XNACK-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-XNACK-NEXT: v_dot8_i32_i4 v0, s1, s8, v0 ; GFX10-DL-XNACK-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-XNACK-NEXT: s_endpgm ; Index: llvm/test/CodeGen/AMDGPU/idot8u.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/idot8u.ll +++ llvm/test/CodeGen/AMDGPU/idot8u.ll @@ -178,13 +178,13 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s0, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s8, v1, v2 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -201,12 +201,12 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: v_dot8_u32_u4 v0, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_dot8_u32_u4 v0, s1, s8, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <8 x i4> 
addrspace(1)* %src2, @@ -1817,36 +1817,36 @@ ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: s_and_b32 s4, s0, 15 -; GFX10-DL-NEXT: s_and_b32 s5, s1, 15 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40004 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40004 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v0 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40008 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40008 -; GFX10-DL-NEXT: v_mad_u32_u24 v0, s4, s5, v0 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x4000c -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x4000c -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40010 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40010 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40014 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40014 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 -; GFX10-DL-NEXT: s_bfe_u32 s6, s0, 0x40018 -; GFX10-DL-NEXT: s_bfe_u32 s7, s1, 0x40018 -; GFX10-DL-NEXT: s_lshr_b32 s0, s0, 28 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: s_and_b32 s0, s1, 15 +; GFX10-DL-NEXT: s_and_b32 s4, s8, 15 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40004 +; GFX10-DL-NEXT: s_bfe_u32 s6, s8, 0x40004 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s0, s4, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s6, v0 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40008 +; GFX10-DL-NEXT: s_bfe_u32 s6, s8, 0x40008 +; GFX10-DL-NEXT: v_mad_u32_u24 v0, s0, s4, v0 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s6, v1 +; GFX10-DL-NEXT: 
s_bfe_u32 s5, s1, 0x4000c +; GFX10-DL-NEXT: s_bfe_u32 s6, s8, 0x4000c +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s6, v1 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40010 +; GFX10-DL-NEXT: s_bfe_u32 s6, s8, 0x40010 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s6, v1 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40014 +; GFX10-DL-NEXT: s_bfe_u32 s6, s8, 0x40014 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s6, v1 +; GFX10-DL-NEXT: s_bfe_u32 s5, s1, 0x40018 +; GFX10-DL-NEXT: s_bfe_u32 s6, s8, 0x40018 ; GFX10-DL-NEXT: s_lshr_b32 s1, s1, 28 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s6, s7, v1 -; GFX10-DL-NEXT: v_mad_u32_u24 v1, s0, s1, v1 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s5, s6, v1 +; GFX10-DL-NEXT: s_lshr_b32 s5, s8, 28 +; GFX10-DL-NEXT: v_mad_u32_u24 v1, s1, s5, v1 ; GFX10-DL-NEXT: v_add_nc_u32_e32 v0, v0, v1 ; GFX10-DL-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm @@ -2092,13 +2092,13 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 -; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 +; GFX9-DL-NEXT: s_load_dword s0, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[4:5], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX9-DL-NEXT: v_mov_b32_e32 v1, s1 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s0, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s8, v1, v2 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -2115,12 +2115,12 @@ ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_addc_u32 s13, s13, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, 
s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: v_dot8_u32_u4 v0, s0, s1, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_dot8_u32_u4 v0, s1, s8, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm <8 x i4> addrspace(1)* %src2, @@ -3332,12 +3332,12 @@ ; GFX9-DL-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX9-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX9-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX9-DL-NEXT: s_load_dword s1, s[2:3], 0x0 +; GFX9-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX9-DL-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-DL-NEXT: v_mov_b32_e32 v1, s0 -; GFX9-DL-NEXT: v_mov_b32_e32 v2, s8 -; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s1, v1, v2 +; GFX9-DL-NEXT: v_mov_b32_e32 v2, s1 +; GFX9-DL-NEXT: v_dot8_u32_u4 v1, s8, v1, v2 ; GFX9-DL-NEXT: global_store_dword v0, v1, s[2:3] ; GFX9-DL-NEXT: s_endpgm ; @@ -3348,12 +3348,12 @@ ; GFX10-DL-NEXT: s_load_dwordx4 s[4:7], s[0:1], 0x24 ; GFX10-DL-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: s_load_dword s8, s[2:3], 0x0 -; GFX10-DL-NEXT: s_load_dword s0, s[4:5], 0x0 -; GFX10-DL-NEXT: s_load_dword s1, s[6:7], 0x0 +; GFX10-DL-NEXT: s_load_dword s0, s[2:3], 0x0 +; GFX10-DL-NEXT: s_load_dword s1, s[4:5], 0x0 +; GFX10-DL-NEXT: s_load_dword s8, s[6:7], 0x0 ; GFX10-DL-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-DL-NEXT: v_mov_b32_e32 v0, s8 -; GFX10-DL-NEXT: v_dot8_u32_u4 v0, s1, s0, v0 +; GFX10-DL-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-DL-NEXT: v_dot8_u32_u4 v0, s8, s1, v0 ; GFX10-DL-NEXT: global_store_dword v1, v0, s[2:3] ; GFX10-DL-NEXT: s_endpgm i32 addrspace(1)* %v2addr, Index: llvm/test/CodeGen/AMDGPU/limit-soft-clause-reg-pressure.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/limit-soft-clause-reg-pressure.mir +++ 
llvm/test/CodeGen/AMDGPU/limit-soft-clause-reg-pressure.mir @@ -15,16 +15,16 @@ occupancy: 6 body: | ; CHECK-LABEL: name: soft_clause_bundle_out_of_registers - ; CHECK: early-clobber %4:sgpr_512, early-clobber %5:sgpr_512 = BUNDLE %3 { - ; CHECK: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 0, 0, 0 :: (load 64, align 4, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX16_IMM1:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 4096, 0, 0 :: (load 64, align 4, addrspace 4) - ; CHECK: } - ; CHECK: [[S_LOAD_DWORDX16_IMM2:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 8192, 0, 0 :: (load 64, align 4, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX16_IMM3:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 12288, 0, 0 :: (load 64, align 4, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX16_IMM4:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 64, 0, 0 :: (load 64, align 4, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX16_IMM5:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 4160, 0, 0 :: (load 64, align 4, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX16_IMM6:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 8256, 0, 0 :: (load 64, align 4, addrspace 4) - ; CHECK: dead $sgpr30_sgpr31 = SI_CALL undef $sgpr4_sgpr5, 0, csr_amdgpu_highregs, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def $sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr70, implicit-def $sgpr80, implicit-def $sgpr90, implicit-def 
$vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 + ; CHECK: bb.0: + ; CHECK: successors: %bb.1(0x80000000) + ; CHECK: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 0, 0, 0 :: (load 64, align 4, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX16_IMM1:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 4096, 0, 0 :: (load 64, align 4, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX16_IMM2:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 8192, 0, 0 :: (load 64, align 4, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX16_IMM3:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 12288, 0, 0 :: (load 64, align 4, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX16_IMM4:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 64, 0, 0 :: (load 64, align 4, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX16_IMM5:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 4160, 0, 0 :: (load 64, align 4, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX16_IMM6:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %3, 8256, 0, 0 :: (load 64, align 4, addrspace 4) + ; CHECK-NEXT: dead $sgpr30_sgpr31 = SI_CALL undef $sgpr4_sgpr5, 0, csr_amdgpu_highregs, implicit-def $vgpr0, implicit-def $vgpr1, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3_sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23_sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, implicit-def 
$sgpr32_sgpr33_sgpr34_sgpr35_sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43_sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51_sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59_sgpr60_sgpr61_sgpr62_sgpr63, implicit-def $sgpr70, implicit-def $sgpr80, implicit-def $sgpr90, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31, implicit-def $vgpr32_vgpr33_vgpr34_vgpr35_vgpr36_vgpr37_vgpr38_vgpr39_vgpr40_vgpr41_vgpr42_vgpr43_vgpr44_vgpr45_vgpr46_vgpr47_vgpr48_vgpr49_vgpr50_vgpr51_vgpr52_vgpr53_vgpr54_vgpr55_vgpr56_vgpr57_vgpr58_vgpr59_vgpr60_vgpr61_vgpr62_vgpr63 bb.0: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6 @@ -118,29 +118,11 @@ bb.0: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6 ; CHECK-LABEL: name: simple_huge_reg_tuple_clause - ; CHECK: liveins: $vgpr0, $sgpr4_sgpr5, $sgpr6 - ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5 - ; CHECK: [[S_MOV_B64_:%[0-9]+]]:sreg_64 = S_MOV_B64 0 - ; CHECK: [[S_MOV_B64_1:%[0-9]+]]:sreg_64 = S_MOV_B64 1 - ; CHECK: [[S_MOV_B64_2:%[0-9]+]]:sreg_64 = S_MOV_B64 2 - ; CHECK: [[S_MOV_B64_3:%[0-9]+]]:sreg_64 = S_MOV_B64 3 - ; CHECK: [[S_MOV_B64_4:%[0-9]+]]:sreg_64 = S_MOV_B64 4 - ; CHECK: [[S_MOV_B64_5:%[0-9]+]]:sreg_64 = S_MOV_B64 5 - ; CHECK: [[S_MOV_B64_6:%[0-9]+]]:sreg_64 = S_MOV_B64 6 - ; CHECK: early-clobber %9:sgpr_512, early-clobber %8:sgpr_512 = BUNDLE [[COPY]] { - ; CHECK: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 0, 0, 0 :: (load 64, align 4, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX16_IMM1:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 64, 0, 0 :: (load 64, align 4, addrspace 4) - ; CHECK: } - ; CHECK: [[S_LOAD_DWORDX16_IMM2:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 4096, 0, 0 :: (load 64, align 4, addrspace 4) - ; CHECK: [[S_LOAD_DWORDX16_IMM3:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM [[COPY]], 4160, 0, 
0 :: (load 64, align 4, addrspace 4) - ; CHECK: S_NOP 0, implicit [[S_LOAD_DWORDX16_IMM]] - ; CHECK: S_NOP 0, implicit [[S_LOAD_DWORDX16_IMM1]] - ; CHECK: S_NOP 0, implicit [[S_LOAD_DWORDX16_IMM2]] - ; CHECK: S_NOP 0, implicit [[S_LOAD_DWORDX16_IMM3]] - ; CHECK: S_NOP 0, implicit [[S_MOV_B64_]], implicit [[S_MOV_B64_1]], implicit [[S_MOV_B64_2]], implicit [[S_MOV_B64_3]], implicit [[S_MOV_B64_4]] - ; CHECK: S_NOP 0, implicit [[S_MOV_B64_5]] - ; CHECK: S_NOP 0, implicit [[S_MOV_B64_6]] - ; CHECK: S_ENDPGM 0 + ; CHECK: [[S_LOAD_DWORDX16_IMM:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %0, 0, 0, 0 :: (load 64, align 4, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX16_IMM1:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %0, 64, 0, 0 :: (load 64, align 4, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX16_IMM2:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %0, 4096, 0, 0 :: (load 64, align 4, addrspace 4) + ; CHECK-NEXT: [[S_LOAD_DWORDX16_IMM3:%[0-9]+]]:sgpr_512 = S_LOAD_DWORDX16_IMM %0, 4160, 0, 0 :: (load 64, align 4, addrspace 4) + ; CHECK-NEXT: S_NOP 0, implicit [[S_LOAD_DWORDX16_IMM]] %0:sreg_64 = COPY $sgpr4_sgpr5 %1:sreg_64 = S_MOV_B64 0 %2:sreg_64 = S_MOV_B64 1 Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-global-agent.ll @@ -1219,25 +1219,25 @@ ; GFX10-WGP-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] 
offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_monotonic_monotonic_cmpxchg: @@ -1292,13 +1292,13 @@ ; GFX10-WGP-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1308,13 +1308,13 @@ ; GFX10-CU-LABEL: global_agent_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1372,29 +1372,29 @@ ; GFX10-WGP-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_release_monotonic_cmpxchg: @@ -1452,15 +1452,15 @@ ; GFX10-WGP-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1470,15 +1470,15 @@ ; GFX10-CU-LABEL: global_agent_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; 
GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1541,15 +1541,15 @@ ; GFX10-WGP-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1559,15 +1559,15 @@ ; GFX10-CU-LABEL: global_agent_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1628,13 +1628,13 @@ ; GFX10-WGP-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1644,13 +1644,13 @@ ; GFX10-CU-LABEL: global_agent_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1712,15 +1712,15 @@ ; GFX10-WGP-LABEL: global_agent_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1730,15 +1730,15 @@ ; GFX10-CU-LABEL: global_agent_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1801,15 +1801,15 @@ ; GFX10-WGP-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1819,15 +1819,15 @@ ; GFX10-CU-LABEL: global_agent_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1890,15 +1890,15 @@ ; GFX10-WGP-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1908,15 +1908,15 @@ ; GFX10-CU-LABEL: global_agent_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1979,15 +1979,15 @@ ; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1997,15 +1997,15 @@ ; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -2070,33 +2070,33 @@ ; GFX10-WGP-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_monotonic_ret_cmpxchg: @@ -2161,37 +2161,37 @@ ; GFX10-WGP-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_monotonic_ret_cmpxchg: @@ -2257,37 +2257,37 @@ ; GFX10-WGP-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_monotonic_ret_cmpxchg: @@ -2351,33 +2351,33 @@ ; GFX10-WGP-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_acquire_acquire_ret_cmpxchg: @@ -2442,37 +2442,37 @@ ; GFX10-WGP-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; 
GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, 
s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_release_acquire_ret_cmpxchg: @@ -2538,37 +2538,37 @@ ; GFX10-WGP-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] 
offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_acq_rel_acquire_ret_cmpxchg: @@ -2634,37 +2634,37 @@ ; GFX10-WGP-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_acquire_ret_cmpxchg: @@ -2730,37 +2730,37 @@ ; GFX10-WGP-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 
0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_seq_cst_seq_cst_ret_cmpxchg: @@ -3994,25 +3994,25 @@ ; GFX10-WGP-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_monotonic_monotonic_cmpxchg: @@ -4067,13 +4067,13 @@ ; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4082,13 +4082,13 @@ ; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; 
GFX10-CU-NEXT: buffer_gl1_inv @@ -4145,29 +4145,29 @@ ; GFX10-WGP-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_monotonic_cmpxchg: @@ -4225,15 +4225,15 @@ ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4242,15 +4242,15 @@ ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4312,15 +4312,15 @@ ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4329,15 +4329,15 @@ ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4397,13 +4397,13 @@ ; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 
v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4412,13 +4412,13 @@ ; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4479,15 +4479,15 @@ ; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; 
GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4496,15 +4496,15 @@ ; GFX10-CU-LABEL: global_agent_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4566,15 +4566,15 @@ ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4583,15 +4583,15 @@ ; GFX10-CU-LABEL: 
global_agent_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4653,15 +4653,15 @@ ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4670,15 +4670,15 @@ ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4740,15 +4740,15 @@ ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4757,15 +4757,15 @@ ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4829,33 +4829,33 @@ ; GFX10-WGP-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_monotonic_ret_cmpxchg: @@ -4920,37 +4920,37 @@ ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; 
GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_monotonic_ret_cmpxchg: @@ -5016,37 +5016,37 @@ ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; 
%entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_monotonic_ret_cmpxchg: @@ -5110,33 +5110,33 @@ ; GFX10-WGP-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; 
GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acquire_acquire_ret_cmpxchg: @@ -5201,37 +5201,37 @@ ; GFX10-WGP-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: 
buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_release_acquire_ret_cmpxchg: @@ -5297,37 +5297,37 @@ ; GFX10-WGP-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, 
v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_acq_rel_acquire_ret_cmpxchg: @@ -5393,37 +5393,37 @@ ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_acquire_ret_cmpxchg: @@ -5489,37 +5489,37 @@ ; GFX10-WGP-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_agent_one_as_seq_cst_seq_cst_ret_cmpxchg: Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll =================================================================== --- 
llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-global-singlethread.ll @@ -1096,25 +1096,25 @@ ; GFX10-WGP-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_monotonic_monotonic_cmpxchg: @@ -1165,25 +1165,25 @@ ; GFX10-WGP-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: 
s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_monotonic_cmpxchg: @@ -1234,25 +1234,25 @@ ; GFX10-WGP-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_monotonic_cmpxchg: @@ -1303,25 +1303,25 @@ ; GFX10-WGP-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; 
GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_monotonic_cmpxchg: @@ -1372,25 +1372,25 @@ ; GFX10-WGP-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_singlethread_seq_cst_monotonic_cmpxchg: @@ -1441,25 +1441,25 @@ ; GFX10-WGP-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_acquire_cmpxchg: @@ -1510,25 +1510,25 @@ ; GFX10-WGP-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; 
GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_acquire_cmpxchg: @@ -1579,25 +1579,25 @@ ; GFX10-WGP-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: 
global_singlethread_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_acquire_cmpxchg: @@ -1648,25 +1648,25 @@ ; GFX10-WGP-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 
v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_acquire_cmpxchg: @@ -1717,25 +1717,25 @@ ; GFX10-WGP-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_seq_cst_cmpxchg: @@ -1792,29 +1792,29 @@ ; GFX10-WGP-LABEL: 
global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_monotonic_ret_cmpxchg: @@ -1875,29 +1875,29 @@ ; GFX10-WGP-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; 
GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_monotonic_ret_cmpxchg: @@ -1958,29 +1958,29 @@ ; GFX10-WGP-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_monotonic_ret_cmpxchg: @@ -2041,29 +2041,29 @@ ; GFX10-WGP-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], 
s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acquire_acquire_ret_cmpxchg: @@ -2124,29 +2124,29 @@ ; GFX10-WGP-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 
v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_release_acquire_ret_cmpxchg: @@ -2207,29 +2207,29 @@ ; GFX10-WGP-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_acq_rel_acquire_ret_cmpxchg: @@ -2290,29 +2290,29 @@ ; GFX10-WGP-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], 
s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_acquire_ret_cmpxchg: @@ -2373,29 +2373,29 @@ ; GFX10-WGP-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, 
s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_seq_cst_seq_cst_ret_cmpxchg: @@ -3511,25 +3511,25 @@ ; GFX10-WGP-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm 
; ; GFX10-CU-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_monotonic_monotonic_cmpxchg: @@ -3580,25 +3580,25 @@ ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_monotonic_cmpxchg: @@ -3649,25 +3649,25 @@ ; GFX10-WGP-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_singlethread_one_as_release_monotonic_cmpxchg: @@ -3718,25 +3718,25 @@ ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_monotonic_cmpxchg: @@ -3787,25 +3787,25 @@ ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_monotonic_cmpxchg: @@ -3856,25 +3856,25 @@ ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] 
offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_acquire_cmpxchg: @@ -3925,25 +3925,25 @@ ; GFX10-WGP-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_acquire_cmpxchg: @@ -3994,25 +3994,25 @@ ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_singlethread_one_as_acq_rel_acquire_cmpxchg: @@ -4063,25 +4063,25 @@ ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_acquire_cmpxchg: @@ -4132,25 +4132,25 @@ ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_seq_cst_cmpxchg: @@ -4207,29 +4207,29 @@ ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] 
offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_monotonic_ret_cmpxchg: @@ -4290,29 +4290,29 @@ ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: 
global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_monotonic_ret_cmpxchg: @@ -4373,29 +4373,29 @@ ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword 
v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_monotonic_ret_cmpxchg: @@ -4456,29 +4456,29 @@ ; GFX10-WGP-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: 
global_singlethread_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acquire_acquire_ret_cmpxchg: @@ -4539,29 +4539,29 @@ ; GFX10-WGP-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; 
%entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_release_acquire_ret_cmpxchg: @@ -4622,29 +4622,29 @@ ; GFX10-WGP-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], 
s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_acq_rel_acquire_ret_cmpxchg: @@ -4705,29 +4705,29 @@ ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_acquire_ret_cmpxchg: @@ -4788,29 +4788,29 @@ ; GFX10-WGP-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_singlethread_one_as_seq_cst_seq_cst_ret_cmpxchg: Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-global-system.ll @@ -1219,25 +1219,25 @@ ; GFX10-WGP-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_monotonic_monotonic_cmpxchg: @@ -1292,13 +1292,13 @@ ; GFX10-WGP-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1308,13 +1308,13 @@ ; GFX10-CU-LABEL: global_system_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1372,29 +1372,29 @@ ; GFX10-WGP-LABEL: global_system_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_release_monotonic_cmpxchg: @@ -1452,15 +1452,15 @@ ; GFX10-WGP-LABEL: 
global_system_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1470,15 +1470,15 @@ ; GFX10-CU-LABEL: global_system_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1541,15 +1541,15 @@ ; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1559,15 +1559,15 @@ ; GFX10-CU-LABEL: global_system_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1628,13 +1628,13 @@ ; GFX10-WGP-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1644,13 +1644,13 @@ ; GFX10-CU-LABEL: global_system_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1712,15 +1712,15 @@ ; GFX10-WGP-LABEL: global_system_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1730,15 +1730,15 @@ ; GFX10-CU-LABEL: global_system_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1801,15 +1801,15 @@ ; GFX10-WGP-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1819,15 +1819,15 @@ ; GFX10-CU-LABEL: global_system_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1890,15 +1890,15 @@ ; GFX10-WGP-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; 
GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1908,15 +1908,15 @@ ; GFX10-CU-LABEL: global_system_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -1979,15 +1979,15 @@ ; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1997,15 
+1997,15 @@ ; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv @@ -2070,33 +2070,33 @@ ; GFX10-WGP-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; 
%bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_monotonic_ret_cmpxchg: @@ -2161,37 +2161,37 @@ ; GFX10-WGP-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: 
global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_monotonic_ret_cmpxchg: @@ -2257,37 +2257,37 @@ ; GFX10-WGP-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap 
v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_monotonic_ret_cmpxchg: @@ -2351,33 +2351,33 @@ ; GFX10-WGP-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] 
offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_acquire_acquire_ret_cmpxchg: @@ -2442,37 +2442,37 @@ ; GFX10-WGP-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_release_acquire_ret_cmpxchg: @@ -2538,37 +2538,37 @@ ; GFX10-WGP-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; 
GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_acq_rel_acquire_ret_cmpxchg: @@ -2634,37 +2634,37 @@ ; GFX10-WGP-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; 
GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: 
global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_acquire_ret_cmpxchg: @@ -2730,37 +2730,37 @@ ; GFX10-WGP-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap 
v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_seq_cst_seq_cst_ret_cmpxchg: @@ -3994,25 +3994,25 @@ ; GFX10-WGP-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_monotonic_monotonic_cmpxchg: @@ -4067,13 +4067,13 @@ ; GFX10-WGP-LABEL: 
global_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4082,13 +4082,13 @@ ; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4145,29 +4145,29 @@ ; GFX10-WGP-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_monotonic_cmpxchg: @@ -4225,15 +4225,15 @@ ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 
; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4242,15 +4242,15 @@ ; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4312,15 +4312,15 @@ ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] 
offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4329,15 +4329,15 @@ ; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4397,13 +4397,13 @@ ; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4412,13 +4412,13 @@ ; 
GFX10-CU-LABEL: global_system_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4479,15 +4479,15 @@ ; GFX10-WGP-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4496,15 +4496,15 @@ ; GFX10-CU-LABEL: global_system_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 
+; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4566,15 +4566,15 @@ ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4583,15 +4583,15 @@ ; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4653,15 +4653,15 @@ ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4670,15 +4670,15 @@ ; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, 
s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4740,15 +4740,15 @@ ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv @@ -4757,15 +4757,15 @@ ; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: 
global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv @@ -4829,33 +4829,33 @@ ; GFX10-WGP-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_monotonic_ret_cmpxchg: @@ -4920,37 +4920,37 @@ ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt 
null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_monotonic_ret_cmpxchg: @@ -5016,37 +5016,37 @@ ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: 
v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_monotonic_ret_cmpxchg: @@ -5110,33 +5110,33 @@ ; GFX10-WGP-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acquire_acquire_ret_cmpxchg: @@ -5201,37 +5201,37 @@ ; GFX10-WGP-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; 
GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_release_acquire_ret_cmpxchg: @@ -5297,37 +5297,37 @@ ; GFX10-WGP-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: 
s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_acq_rel_acquire_ret_cmpxchg: @@ -5393,37 +5393,37 @@ ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt 
vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_acquire_ret_cmpxchg: @@ -5489,37 +5489,37 @@ ; GFX10-WGP-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt 
null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: buffer_gl1_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) ; GFX10-CU-NEXT: buffer_gl0_inv ; GFX10-CU-NEXT: buffer_gl1_inv -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_system_one_as_seq_cst_seq_cst_ret_cmpxchg: Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-global-wavefront.ll @@ -1096,25 +1096,25 @@ ; GFX10-WGP-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_monotonic_monotonic_cmpxchg: @@ -1165,25 +1165,25 @@ ; GFX10-WGP-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 
+; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_monotonic_cmpxchg: @@ -1234,25 +1234,25 @@ ; GFX10-WGP-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; 
GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_monotonic_cmpxchg: @@ -1303,25 +1303,25 @@ ; GFX10-WGP-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; 
GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_monotonic_cmpxchg: @@ -1372,25 +1372,25 @@ ; GFX10-WGP-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_monotonic_cmpxchg: @@ -1441,25 +1441,25 @@ ; GFX10-WGP-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_acquire_cmpxchg: @@ -1510,25 +1510,25 @@ ; GFX10-WGP-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: 
s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_acquire_cmpxchg: @@ -1579,25 +1579,25 @@ ; GFX10-WGP-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: 
v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_acquire_cmpxchg: @@ -1648,25 +1648,25 @@ ; GFX10-WGP-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_acquire_cmpxchg: @@ -1717,25 +1717,25 @@ ; GFX10-WGP-LABEL: 
global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_seq_cst_cmpxchg: @@ -1792,29 +1792,29 @@ ; GFX10-WGP-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 
v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_monotonic_ret_cmpxchg: @@ -1875,29 +1875,29 @@ ; GFX10-WGP-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, 
v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_monotonic_ret_cmpxchg: @@ -1958,29 +1958,29 @@ ; GFX10-WGP-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 
v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_monotonic_ret_cmpxchg: @@ -2041,29 +2041,29 @@ ; GFX10-WGP-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: 
s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acquire_acquire_ret_cmpxchg: @@ -2124,29 +2124,29 @@ ; GFX10-WGP-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: 
global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_release_acquire_ret_cmpxchg: @@ -2207,29 +2207,29 @@ ; GFX10-WGP-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: 
global_wavefront_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_acq_rel_acquire_ret_cmpxchg: @@ -2290,29 +2290,29 @@ ; GFX10-WGP-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; 
GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_acquire_ret_cmpxchg: @@ -2373,29 +2373,29 @@ ; GFX10-WGP-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_seq_cst_seq_cst_ret_cmpxchg: @@ -3511,25 +3511,25 @@ ; GFX10-WGP-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, 
s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_monotonic_monotonic_cmpxchg: @@ -3580,25 +3580,25 @@ ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_monotonic_cmpxchg: @@ -3649,25 +3649,25 @@ ; GFX10-WGP-LABEL: 
global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_monotonic_cmpxchg: @@ -3718,25 +3718,25 @@ ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_monotonic_cmpxchg: @@ -3787,25 +3787,25 @@ ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; 
%bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_monotonic_cmpxchg: @@ -3856,25 +3856,25 @@ ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 
-; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_acquire_cmpxchg: @@ -3925,25 +3925,25 @@ ; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_acquire_cmpxchg: @@ -3994,25 +3994,25 @@ ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; 
GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_acquire_cmpxchg: @@ -4063,25 +4063,25 @@ ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: 
v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_acquire_cmpxchg: @@ -4132,25 +4132,25 @@ ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_seq_cst_cmpxchg: @@ -4207,29 +4207,29 @@ ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; 
GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_monotonic_ret_cmpxchg: @@ -4290,29 +4290,29 @@ ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_monotonic_ret_cmpxchg: @@ -4373,29 +4373,29 @@ ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, 
v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_monotonic_ret_cmpxchg: @@ -4456,29 +4456,29 @@ ; GFX10-WGP-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acquire_acquire_ret_cmpxchg: @@ -4539,29 +4539,29 @@ ; GFX10-WGP-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, 
v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_release_acquire_ret_cmpxchg: @@ -4622,29 +4622,29 @@ ; GFX10-WGP-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: 
global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_acq_rel_acquire_ret_cmpxchg: @@ -4705,29 +4705,29 @@ ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; 
GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_wavefront_one_as_seq_cst_acquire_ret_cmpxchg: @@ -4788,29 +4788,29 @@ ; GFX10-WGP-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_wavefront_one_as_seq_cst_seq_cst_ret_cmpxchg: Index: llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll +++ llvm/test/CodeGen/AMDGPU/memory-legalizer-global-workgroup.ll @@ -1168,25 +1168,25 @@ ; GFX10-WGP-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_monotonic_monotonic_cmpxchg: @@ -1239,13 +1239,13 @@ ; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; 
%bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1254,13 +1254,13 @@ ; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; @@ -1315,28 +1315,28 @@ ; GFX10-WGP-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_monotonic_cmpxchg: @@ -1392,15 +1392,15 @@ ; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, 
v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1409,14 +1409,14 @@ ; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; @@ -1474,15 +1474,15 @@ ; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ 
-1491,14 +1491,14 @@ ; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; @@ -1554,13 +1554,13 @@ ; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1569,13 +1569,13 @@ ; GFX10-CU-LABEL: global_workgroup_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; 
GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; @@ -1632,15 +1632,15 @@ ; GFX10-WGP-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1649,14 +1649,14 @@ ; GFX10-CU-LABEL: global_workgroup_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: 
v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; @@ -1714,15 +1714,15 @@ ; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1731,14 +1731,14 @@ ; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: 
s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; @@ -1796,15 +1796,15 @@ ; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1813,14 +1813,14 @@ ; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; @@ -1878,15 +1878,15 @@ ; GFX10-WGP-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv @@ -1895,14 +1895,14 @@ ; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-CU-NEXT: s_endpgm ; @@ -1963,30 +1963,30 @@ ; GFX10-WGP-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: 
v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_monotonic_ret_cmpxchg: @@ -2050,33 +2050,33 @@ ; GFX10-WGP-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt 
lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_monotonic_ret_cmpxchg: @@ -2141,33 +2141,33 @@ ; GFX10-WGP-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_monotonic_ret_cmpxchg: @@ -2230,30 +2230,30 @@ ; GFX10-WGP-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acquire_acquire_ret_cmpxchg: @@ -2317,33 +2317,33 @@ ; GFX10-WGP-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; 
GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_release_acquire_ret_cmpxchg: @@ -2408,33 +2408,33 @@ ; GFX10-WGP-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 
s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_acq_rel_acquire_ret_cmpxchg: @@ -2499,33 +2499,33 @@ ; GFX10-WGP-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; 
GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_seq_cst_acquire_ret_cmpxchg: @@ -2590,33 +2590,33 @@ ; GFX10-WGP-LABEL: 
global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_workgroup_seq_cst_seq_cst_ret_cmpxchg: @@ -3759,25 +3759,25 @@ ; GFX10-WGP-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_monotonic_monotonic_cmpxchg: @@ -3828,13 +3828,13 @@ ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 
s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -3842,13 +3842,13 @@ ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_cmpxchg: @@ -3899,27 +3899,27 @@ ; GFX10-WGP-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_monotonic_cmpxchg: @@ -3970,15 +3970,15 @@ ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -3986,13 +3986,13 @@ ; GFX10-CU-LABEL: 
global_workgroup_one_as_acq_rel_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_cmpxchg: @@ -4043,15 +4043,15 @@ ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -4059,13 +4059,13 @@ ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; 
GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_cmpxchg: @@ -4116,13 +4116,13 @@ ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -4130,13 +4130,13 @@ ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], 
s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_cmpxchg: @@ -4187,15 +4187,15 @@ ; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -4203,13 +4203,13 @@ ; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: 
global_workgroup_one_as_release_acquire_cmpxchg: @@ -4260,15 +4260,15 @@ ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -4276,13 +4276,13 @@ ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_cmpxchg: @@ -4333,15 +4333,15 @@ ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: 
s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -4349,13 +4349,13 @@ ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_cmpxchg: @@ -4406,15 +4406,15 @@ ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 
; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-WGP-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-WGP-NEXT: buffer_gl0_inv ; GFX10-WGP-NEXT: s_endpgm @@ -4422,13 +4422,13 @@ ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[0:1] offset:16 +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v2, v[0:1], s[2:3] offset:16 ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_cmpxchg: @@ -4485,30 +4485,30 @@ ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: 
v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_monotonic_ret_cmpxchg: @@ -4569,32 +4569,32 @@ ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: 
s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_monotonic_ret_cmpxchg: @@ -4655,32 +4655,32 @@ ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: 
s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_monotonic_ret_cmpxchg: @@ -4741,30 +4741,30 @@ ; GFX10-WGP-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] 
offset:16 glc +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acquire_acquire_ret_cmpxchg: @@ -4825,32 +4825,32 @@ ; GFX10-WGP-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; 
GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_release_acquire_ret_cmpxchg: @@ -4911,32 +4911,32 @@ ; GFX10-WGP-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; 
GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_acq_rel_acquire_ret_cmpxchg: @@ -4997,32 +4997,32 @@ ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_acquire_ret_cmpxchg: @@ -5083,32 +5083,32 @@ ; GFX10-WGP-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-WGP: ; %bb.0: ; %entry ; GFX10-WGP-NEXT: s_clause 0x1 -; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-WGP-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-WGP-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-WGP-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-WGP-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-WGP-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s3 +; 
GFX10-WGP-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-WGP-NEXT: v_mov_b32_e32 v1, s1 ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-WGP-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-WGP-NEXT: s_waitcnt vmcnt(0) ; GFX10-WGP-NEXT: buffer_gl0_inv -; GFX10-WGP-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-WGP-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-WGP-NEXT: s_endpgm ; ; GFX10-CU-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: ; GFX10-CU: ; %bb.0: ; %entry ; GFX10-CU-NEXT: s_clause 0x1 -; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x8 -; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x0 +; GFX10-CU-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x8 +; GFX10-CU-NEXT: s_load_dwordx2 s[2:3], s[4:5], 0x0 ; GFX10-CU-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-CU-NEXT: s_waitcnt lgkmcnt(0) -; GFX10-CU-NEXT: v_mov_b32_e32 v0, s2 -; GFX10-CU-NEXT: v_mov_b32_e32 v1, s3 -; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[0:1] offset:16 glc +; GFX10-CU-NEXT: v_mov_b32_e32 v0, s0 +; GFX10-CU-NEXT: v_mov_b32_e32 v1, s1 +; GFX10-CU-NEXT: global_atomic_cmpswap v0, v2, v[0:1], s[2:3] offset:16 glc ; GFX10-CU-NEXT: s_waitcnt vmcnt(0) -; GFX10-CU-NEXT: global_store_dword v2, v0, s[0:1] +; GFX10-CU-NEXT: global_store_dword v2, v0, s[2:3] ; GFX10-CU-NEXT: s_endpgm ; ; SKIP-CACHE-INV-LABEL: global_workgroup_one_as_seq_cst_seq_cst_ret_cmpxchg: Index: llvm/test/CodeGen/AMDGPU/memory_clause.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/memory_clause.ll +++ llvm/test/CodeGen/AMDGPU/memory_clause.ll @@ -184,8 +184,9 @@ ; GCN-NEXT: buffer_load_dword v15, v0, s[0:3], 0 offen offset:48 ; GCN-NEXT: buffer_load_dword v16, v0, s[0:3], 0 offen offset:52 ; GCN-NEXT: buffer_load_dword v17, v0, s[0:3], 0 offen offset:56 -; GCN-NEXT: v_add_u32_e32 v1, v1, v2 +; GCN-NEXT: s_nop 0 ; GCN-NEXT: 
buffer_load_dword v0, v0, s[0:3], 0 offen offset:60 +; GCN-NEXT: v_add_u32_e32 v1, v1, v2 ; GCN-NEXT: s_waitcnt vmcnt(12) ; GCN-NEXT: buffer_store_dword v6, v1, s[0:3], 0 offen offset:12 ; GCN-NEXT: buffer_store_dword v5, v1, s[0:3], 0 offen offset:8 Index: llvm/test/CodeGen/AMDGPU/memory_clause.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/memory_clause.mir +++ llvm/test/CodeGen/AMDGPU/memory_clause.mir @@ -1,26 +1,46 @@ -# RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -run-pass=phi-node-elimination,si-form-memory-clauses %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx902 -verify-machineinstrs -run-pass=si-form-memory-clauses %s -o - | FileCheck -check-prefix=GCN %s # GCN-LABEL: {{^}}name: vector_clause{{$}} -# GCN: early-clobber %2:vreg_128, early-clobber %4:vreg_128, early-clobber %1:vreg_128, early-clobber %3:vreg_128 = BUNDLE %0, implicit $exec { -# GCN-NEXT: %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec -# GCN-NEXT: %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec -# GCN-NEXT: %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec -# GCN-NEXT: } -# GCN-NEXT: GLOBAL_STORE_DWORDX4 %0, %1, 0, 0, 0, 0, implicit $exec +# GCN: %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec +# GCN-NEXT: %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec +# GCN-NEXT: %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec +# GCN-NEXT: KILL %0{{$}} +# GCN-NEXT: %5:vreg_64 = IMPLICIT_DEF +# GCN-NEXT: GLOBAL_STORE_DWORDX4 %5, %1, 0, 0, 0, 0, implicit $exec --- name: vector_clause tracksRegLiveness: true -registers: - - { id: 0, class: vreg_64 } - - { id: 1, class: vreg_128 } - - { id: 2, class: vreg_128 } - - { id: 3, class: vreg_128 } - - { id: 4, 
class: vreg_128 } body: | bb.0: - %0 = IMPLICIT_DEF + %0:vreg_64 = IMPLICIT_DEF + %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec + %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec + %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec + %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec + %5:vreg_64 = IMPLICIT_DEF + GLOBAL_STORE_DWORDX4 %5, %1, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %5, %2, 16, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %5, %3, 32, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %5, %4, 48, 0, 0, 0, implicit $exec +... + +# This would be a valid soft clause, but there's no need for a KILL +# since the pointer uses are live beyond the end of the clause. +# GCN-LABEL: {{^}}name: vector_clause_no_kill{{$}} +# GCN: %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec +# GCN-NEXT: %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec +# GCN-NEXT: %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec +# GCN-NEXT: GLOBAL_STORE_DWORDX4 %0, %1, 0, 0, 0, 0, implicit $exec + +--- +name: vector_clause_no_kill +tracksRegLiveness: true +body: | + bb.0: + %0:vreg_64 = IMPLICIT_DEF %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec @@ -32,12 +52,11 @@ ... 
# GCN-LABEL: {{^}}name: subreg_full{{$}} -# GCN: early-clobber %1:vreg_128 = BUNDLE %0, implicit $exec { -# GCN-NEXT: undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: internal %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, implicit $exec -# GCN-NEXT: internal %1.sub2:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, implicit $exec -# GCN-NEXT: internal %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, implicit $exec -# GCN-NEXT: } +# GCN: undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, implicit $exec +# GCN-NEXT: %1.sub2:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, implicit $exec +# GCN-NEXT: %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, implicit $exec +# GCN-NEXT: KILL %0.sub2_sub3{{$}} # GCN-NEXT: GLOBAL_STORE_DWORDX4 %0.sub0_sub1, %1, 0, 0, 0, 0, implicit $exec --- @@ -57,11 +76,10 @@ ... # GCN-LABEL: {{^}}name: subreg_part{{$}} -# GCN: undef early-clobber %1.sub0_sub1:vreg_128, undef early-clobber %1.sub3:vreg_128 = BUNDLE %0, implicit $exec { -# GCN-NEXT: undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: internal %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, implicit $exec -# GCN-NEXT: internal %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, implicit $exec -# GCN-NEXT: } +# GCN: undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, implicit $exec +# GCN-NEXT: %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, implicit $exec +# GCN-NEXT: KILL %0.sub2_sub3{{$}} # GCN-NEXT: GLOBAL_STORE_DWORDX4 %0.sub0_sub1, %1, 0, 0, 0, 0, implicit $exec --- @@ -80,12 +98,11 @@ ... 
# GCN-LABEL: {{^}}name: dead{{$}} -# GCN: dead early-clobber %2:vreg_128, dead early-clobber %4:vreg_128, dead early-clobber %1:vreg_128, dead early-clobber %3:vreg_128 = BUNDLE %0, implicit $exec { -# GCN-NEXT: dead %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec -# GCN-NEXT: } +# GCN: dead %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec +# GCN-NEXT: KILL %0{{$}} --- name: dead @@ -106,56 +123,48 @@ ... # GCN-LABEL: {{^}}name: subreg_dead{{$}} -# GCN: early-clobber %1:vreg_64 = BUNDLE %0, implicit $exec { -# GCN-NEXT: %1.sub0:vreg_64 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %1.sub1:vreg_64 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, implicit $exec -# GCN-NEXT: } -# GCN-NEXT: GLOBAL_STORE_DWORD %0, %1.sub0, 0, 0, 0, 0, implicit $exec +# GCN: undef %2.sub0:vreg_64 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %2.sub1:vreg_64 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, implicit $exec +# GCN-NEXT: KILL %0{{$}} +# GCN-NEXT: GLOBAL_STORE_DWORD %1, %2.sub0, 0, 0, 0, 0, implicit $exec --- name: subreg_dead tracksRegLiveness: true -registers: - - { id: 0, class: vreg_64 } - - { id: 1, class: vreg_64 } body: | bb.0: - %0 = IMPLICIT_DEF - undef %1.sub0:vreg_64 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, implicit $exec - dead %1.sub1:vreg_64 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORD %0, %1.sub0, 0, 0, 0, 0, implicit $exec + %0:vreg_64 = IMPLICIT_DEF + %1:vreg_64 = IMPLICIT_DEF + undef 
%2.sub0:vreg_64 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, implicit $exec + dead %2.sub1:vreg_64 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORD %1, %2.sub0, 0, 0, 0, 0, implicit $exec ... # GCN-LABEL: {{^}}name: kill{{$}} -# GCN: early-clobber %2:vreg_128, early-clobber %3:vreg_128 = BUNDLE %0, %1, implicit $exec { -# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 16, 0, 0, 0, implicit $exec -# GCN-NEXT: } +# GCN: %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 16, 0, 0, 0, implicit $exec +# GCN-NEXT: KILL %1{{$}} +# GCN-NEXT: KILL %0{{$}} --- name: kill tracksRegLiveness: true -registers: - - { id: 0, class: vreg_64 } - - { id: 1, class: vreg_64 } - - { id: 2, class: vreg_128 } - - { id: 3, class: vreg_128 } body: | bb.0: - %0 = IMPLICIT_DEF - %1 = IMPLICIT_DEF - %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec - %3:vreg_128 = GLOBAL_LOAD_DWORDX4 killed %1, 16, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %0, %2, 0, 0, 0, 0, implicit $exec - GLOBAL_STORE_DWORDX4 %0, %3, 16, 0, 0, 0, implicit $exec + %0:vreg_64 = IMPLICIT_DEF + %1:vreg_64 = IMPLICIT_DEF + %2:vreg_64 = IMPLICIT_DEF + %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec + %4:vreg_128 = GLOBAL_LOAD_DWORDX4 killed %1, 16, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %2, %3, 0, 0, 0, 0, implicit $exec + GLOBAL_STORE_DWORDX4 %2, %4, 16, 0, 0, 0, implicit $exec ... 
# GCN-LABEL: {{^}}name: indirect{{$}} # GCN: %1:vreg_64 = GLOBAL_LOAD_DWORDX2 %0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: early-clobber %2:vreg_128, early-clobber %3:vreg_128 = BUNDLE %1, implicit $exec { -# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 16, 0, 0, 0, implicit $exec -# GCN-NEXT: } +# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 16, 0, 0, 0, implicit $exec +# GCN-NEXT: KILL %1{{$}} --- name: indirect @@ -200,115 +209,84 @@ ... # GCN-LABEL: {{^}}name: overflow_counter{{$}} -# GCN: dead early-clobber %7:vgpr_32, dead early-clobber %14:vgpr_32, dead early-clobber %2:vgpr_32, dead early-clobber %9:vgpr_32, dead early-clobber %4:vgpr_32, dead early-clobber %11:vgpr_32, dead early-clobber %6:vgpr_32, dead early-clobber %13:vgpr_32, dead early-clobber %1:vgpr_32, dead early-clobber %8:vgpr_32, dead early-clobber %15:vgpr_32, dead early-clobber %3:vgpr_32, dead early-clobber %10:vgpr_32, dead early-clobber %5:vgpr_32, dead early-clobber %12:vgpr_32 = BUNDLE %0, implicit $exec { -# GCN-NEXT: dead %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %6:vgpr_32 = GLOBAL_LOAD_DWORD %0, 20, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %7:vgpr_32 = GLOBAL_LOAD_DWORD %0, 24, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %8:vgpr_32 = GLOBAL_LOAD_DWORD %0, 28, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %9:vgpr_32 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %10:vgpr_32 = GLOBAL_LOAD_DWORD %0, 36, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %11:vgpr_32 = 
GLOBAL_LOAD_DWORD %0, 40, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %12:vgpr_32 = GLOBAL_LOAD_DWORD %0, 44, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %13:vgpr_32 = GLOBAL_LOAD_DWORD %0, 48, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %14:vgpr_32 = GLOBAL_LOAD_DWORD %0, 52, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %15:vgpr_32 = GLOBAL_LOAD_DWORD %0, 56, 0, 0, 0, implicit $exec -# GCN-NEXT: } -# GCN-NEXT: dead early-clobber %16:vgpr_32, dead early-clobber %17:vgpr_32 = BUNDLE %0, implicit $exec { -# GCN-NEXT: dead %16:vgpr_32 = GLOBAL_LOAD_DWORD %0, 60, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %17:vgpr_32 = GLOBAL_LOAD_DWORD %0, 64, 0, 0, 0, implicit $exec -# GCN-NEXT: } +# GCN: dead %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %6:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %7:vgpr_32 = GLOBAL_LOAD_DWORD %0, 20, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %8:vgpr_32 = GLOBAL_LOAD_DWORD %0, 24, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %9:vgpr_32 = GLOBAL_LOAD_DWORD %0, 28, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %10:vgpr_32 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %11:vgpr_32 = GLOBAL_LOAD_DWORD %0, 36, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %12:vgpr_32 = GLOBAL_LOAD_DWORD %0, 40, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %13:vgpr_32 = GLOBAL_LOAD_DWORD %0, 44, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %14:vgpr_32 = GLOBAL_LOAD_DWORD %0, 48, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %15:vgpr_32 = GLOBAL_LOAD_DWORD %0, 52, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %16:vgpr_32 = GLOBAL_LOAD_DWORD %0, 56, 0, 0, 0, implicit $exec +# GCN-NEXT: KILL %0{{$}} +# GCN-NEXT: dead %17:vgpr_32 = GLOBAL_LOAD_DWORD %1, 60, 0, 0, 0, implicit $exec 
+# GCN-NEXT: dead %18:vgpr_32 = GLOBAL_LOAD_DWORD %1, 64, 0, 0, 0, implicit $exec +# GCN-NEXT: KILL %1{{$}} --- name: overflow_counter tracksRegLiveness: true -registers: - - { id: 0, class: vreg_64 } - - { id: 1, class: vgpr_32 } - - { id: 2, class: vgpr_32 } - - { id: 3, class: vgpr_32 } - - { id: 4, class: vgpr_32 } - - { id: 5, class: vgpr_32 } - - { id: 6, class: vgpr_32 } - - { id: 7, class: vgpr_32 } - - { id: 8, class: vgpr_32 } - - { id: 9, class: vgpr_32 } - - { id: 10, class: vgpr_32 } - - { id: 11, class: vgpr_32 } - - { id: 12, class: vgpr_32 } - - { id: 13, class: vgpr_32 } - - { id: 14, class: vgpr_32 } - - { id: 15, class: vgpr_32 } - - { id: 16, class: vgpr_32 } - - { id: 17, class: vgpr_32 } body: | bb.0: - %0 = IMPLICIT_DEF - %1:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec - %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec - %3:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, 0, 0, implicit $exec - %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, 0, 0, implicit $exec - %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, implicit $exec - %6:vgpr_32 = GLOBAL_LOAD_DWORD %0, 20, 0, 0, 0, implicit $exec - %7:vgpr_32 = GLOBAL_LOAD_DWORD %0, 24, 0, 0, 0, implicit $exec - %8:vgpr_32 = GLOBAL_LOAD_DWORD %0, 28, 0, 0, 0, implicit $exec - %9:vgpr_32 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, implicit $exec - %10:vgpr_32 = GLOBAL_LOAD_DWORD %0, 36, 0, 0, 0, implicit $exec - %11:vgpr_32 = GLOBAL_LOAD_DWORD %0, 40, 0, 0, 0, implicit $exec - %12:vgpr_32 = GLOBAL_LOAD_DWORD %0, 44, 0, 0, 0, implicit $exec - %13:vgpr_32 = GLOBAL_LOAD_DWORD %0, 48, 0, 0, 0, implicit $exec - %14:vgpr_32 = GLOBAL_LOAD_DWORD %0, 52, 0, 0, 0, implicit $exec - %15:vgpr_32 = GLOBAL_LOAD_DWORD %0, 56, 0, 0, 0, implicit $exec - %16:vgpr_32 = GLOBAL_LOAD_DWORD %0, 60, 0, 0, 0, implicit $exec - %17:vgpr_32 = GLOBAL_LOAD_DWORD %0, 64, 0, 0, 0, implicit $exec + %0:vreg_64 = IMPLICIT_DEF + %1:vreg_64 = IMPLICIT_DEF + %2:vgpr_32 = GLOBAL_LOAD_DWORD %0, 0, 0, 0, 0, implicit $exec + %3:vgpr_32 = 
GLOBAL_LOAD_DWORD %0, 4, 0, 0, 0, implicit $exec + %4:vgpr_32 = GLOBAL_LOAD_DWORD %0, 8, 0, 0, 0, implicit $exec + %5:vgpr_32 = GLOBAL_LOAD_DWORD %0, 12, 0, 0, 0, implicit $exec + %6:vgpr_32 = GLOBAL_LOAD_DWORD %0, 16, 0, 0, 0, implicit $exec + %7:vgpr_32 = GLOBAL_LOAD_DWORD %0, 20, 0, 0, 0, implicit $exec + %8:vgpr_32 = GLOBAL_LOAD_DWORD %0, 24, 0, 0, 0, implicit $exec + %9:vgpr_32 = GLOBAL_LOAD_DWORD %0, 28, 0, 0, 0, implicit $exec + %10:vgpr_32 = GLOBAL_LOAD_DWORD %0, 32, 0, 0, 0, implicit $exec + %11:vgpr_32 = GLOBAL_LOAD_DWORD %0, 36, 0, 0, 0, implicit $exec + %12:vgpr_32 = GLOBAL_LOAD_DWORD %0, 40, 0, 0, 0, implicit $exec + %13:vgpr_32 = GLOBAL_LOAD_DWORD %0, 44, 0, 0, 0, implicit $exec + %14:vgpr_32 = GLOBAL_LOAD_DWORD %0, 48, 0, 0, 0, implicit $exec + %15:vgpr_32 = GLOBAL_LOAD_DWORD %0, 52, 0, 0, 0, implicit $exec + %16:vgpr_32 = GLOBAL_LOAD_DWORD %0, 56, 0, 0, 0, implicit $exec + %17:vgpr_32 = GLOBAL_LOAD_DWORD %1, 60, 0, 0, 0, implicit $exec + %18:vgpr_32 = GLOBAL_LOAD_DWORD %1, 64, 0, 0, 0, implicit $exec ... 
# GCN-LABEL: {{^}}name: reg_pressure{{$}} -# GCN: dead early-clobber %2:vreg_128, dead early-clobber %4:vreg_128, dead early-clobber %1:vreg_128, dead early-clobber %3:vreg_128, dead early-clobber %5:vreg_128 = BUNDLE %0, implicit $exec { -# GCN-NEXT: dead %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, 0, 0, implicit $exec -# GCN-NEXT: } -# GCN-NEXT: dead early-clobber %7:vreg_128, dead early-clobber %6:vreg_128 = BUNDLE %0, implicit $exec { -# GCN-NEXT: dead %6:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 80, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 96, 0, 0, 0, implicit $exec -# GCN-NEXT: } +# GCN: dead %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %6:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, 0, 0, implicit $exec +# GCN-NEXT: KILL %0{{$}} +# GCN-NEXT: dead %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 80, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %8:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 96, 0, 0, 0, implicit $exec +# GCN-NEXT: KILL %1{{$}} --- name: reg_pressure tracksRegLiveness: true -registers: - - { id: 0, class: vreg_64 } - - { id: 1, class: vreg_128 } - - { id: 2, class: vreg_128 } - - { id: 3, class: vreg_128 } - - { id: 4, class: vreg_128 } - - { id: 5, class: vreg_128 } - - { id: 6, class: vreg_128 } - - { id: 7, class: vreg_128 } body: | bb.0: - %0 = IMPLICIT_DEF - %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit 
$exec - %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec - %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec - %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec - %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, 0, 0, implicit $exec - %6:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 80, 0, 0, 0, implicit $exec - %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 96, 0, 0, 0, implicit $exec + %0:vreg_64 = IMPLICIT_DEF + %1:vreg_64 = IMPLICIT_DEF + %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec + %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec + %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec + %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 48, 0, 0, 0, implicit $exec + %6:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 64, 0, 0, 0, implicit $exec + %7:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 80, 0, 0, 0, implicit $exec + %8:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 96, 0, 0, 0, implicit $exec ... # GCN-LABEL: {{^}}name: image_clause{{$}} -# GCN: early-clobber %4:vreg_128, early-clobber %3:vreg_128, early-clobber %5:vreg_128 = BUNDLE %0, undef %2:sgpr_128, %1, implicit $exec { -# GCN-NEXT: %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: %4:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: %5:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: } +# GCN: %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %4:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %5:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, undef %2:sgpr_128, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: KILL undef %2:sgpr_128{{$}} # GCN-NEXT: IMAGE_STORE_V4_V2 %3, %0, %1, 15, -1, 0, 0, 0, 0, 0, 0, 0, implicit $exec --- @@ -334,11 
+312,12 @@ ... # GCN-LABEL: {{^}}name: mixed_clause{{$}} -# GCN: dead early-clobber %4:vreg_128, dead early-clobber %3:vreg_128, dead early-clobber %5:vgpr_32 = BUNDLE %0, %2, %1, implicit $exec { -# GCN-NEXT: dead %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, %2, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: dead %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec -# GCN-NEXT: } +# GCN: dead %3:vreg_128 = IMAGE_SAMPLE_LZ_V4_V2 %0, %1, %2, 15, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %5:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET %2, 0, 0, 0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: KILL %1{{$}} +# GCN-NEXT: KILL %2{{$}} +# GCN-NEXT: KILL %0{{$}} --- name: mixed_clause @@ -386,3 +365,164 @@ FLAT_ATOMIC_ADD %0, %1, 0, 0, implicit $exec, implicit $flat_scr S_ENDPGM 0 ... + + +# One of the pointers has an additional use after the clause, but one +# doesn't. Only the final use should be killed. 
+ +# GCN-LABEL: {{^}}name: ptr_use_after_clause{{$}} +# GCN: dead %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 16, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 48, 0, 0, 0, implicit $exec +# GCN-NEXT: KILL %1{{$}} +# GCN-NEXT: S_NOP 0, implicit %0 +--- +name: ptr_use_after_clause +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1, $vgpr2_vgpr3 + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vreg_64 = COPY $vgpr2_vgpr3 + %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec + %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 16, 0, 0, 0, implicit $exec + %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 32, 0, 0, 0, implicit $exec + %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %1, 48, 0, 0, 0, implicit $exec + S_NOP 0, implicit %0 +... + +# Only part of the register is really live past the clause. +# GCN-LABEL: {{^}}name: ptr_use_after_clause_subreg{{$}} +# GCN: dead %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub0_sub1, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub2_sub3, 16, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub0_sub1, 32, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub2_sub3, 48, 0, 0, 0, implicit $exec +# GCN-NEXT: KILL %0.sub2_sub3{{$}} +# GCN-NEXT: S_NOP 0, implicit %0.sub0_sub1{{$}} +--- +name: ptr_use_after_clause_subreg +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub0_sub1, 0, 0, 0, 0, implicit $exec + %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub2_sub3, 16, 0, 0, 0, implicit $exec + %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub0_sub1, 32, 0, 0, 0, implicit $exec + %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub2_sub3, 48, 0, 0, 0, implicit $exec + S_NOP 0, implicit 
%0.sub0_sub1 +... + +# More complex situation where only some of the use subregisters live +# beyond the clause. + +# GCN-LABEL: {{^}}name: ptr_use_after_clause_subreg_multi{{$}} +# GCN: dead %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub0_sub1, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub2_sub3, 16, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub3_sub4, 32, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub6_sub7, 48, 0, 0, 0, implicit $exec +# GCN-NEXT: KILL %0.sub2_sub3_sub4, %0.sub7{{$}} +# GCN-NEXT: S_NOP 0, implicit %0.sub0_sub1, implicit %0.sub5_sub6 +--- +name: ptr_use_after_clause_subreg_multi +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + %0:vreg_256 = COPY $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 + %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub0_sub1, 0, 0, 0, 0, implicit $exec + %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub2_sub3, 16, 0, 0, 0, implicit $exec + %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub3_sub4, 32, 0, 0, 0, implicit $exec + %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub6_sub7, 48, 0, 0, 0, implicit $exec + S_NOP 0, implicit %0.sub0_sub1, implicit %0.sub5_sub6 +... 
+ +# Have subranges, but none of them are killed +# GCN-LABEL: {{^}}name: no_killed_subranges{{$}} +# GCN: dead %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub0_sub1, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub2_sub3, 16, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub0_sub1, 32, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub2_sub3, 48, 0, 0, 0, implicit $exec +# GCN-NEXT: S_NOP 0, implicit %0.sub0_sub1{{$}} +# GCN-NEXT: S_NOP 0, implicit %0.sub2_sub3{{$}} +--- +name: no_killed_subranges +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub0_sub1, 0, 0, 0, 0, implicit $exec + %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub2_sub3, 16, 0, 0, 0, implicit $exec + %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub0_sub1, 32, 0, 0, 0, implicit $exec + %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub2_sub3, 48, 0, 0, 0, implicit $exec + S_NOP 0, implicit %0.sub0_sub1 + S_NOP 0, implicit %0.sub2_sub3 +... + +# sub2 is undef at entry to the soft clause. It should not have its +# live range extended. + +# GCN-LABEL: name: no_killed_undef_subrange_use +# GCN: dead %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub0_sub1, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: dead %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub0_sub1, 32, 0, 0, 0, implicit $exec +# GCN-NEXT: KILL %0.sub0_sub1{{$}} +# GCN-NEXT: %0.sub2:vreg_128 = V_MOV_B32_e32 0, implicit $exec +# GCN-NEXT: S_NOP 0, implicit %0.sub2 +--- +name: no_killed_undef_subrange_use +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + undef %0.sub0_sub1:vreg_128 = COPY $vgpr0_vgpr1 + %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub0_sub1, 0, 0, 0, 0, implicit $exec + %3:vreg_128 = GLOBAL_LOAD_DWORDX4 %0.sub0_sub1, 32, 0, 0, 0, implicit $exec + %0.sub2:vreg_128 = V_MOV_B32_e32 0, implicit $exec + S_NOP 0, implicit %0.sub2 +... 
+ +# Make sure intervening implicit_defs are not treated as breaking the +# clause +# +# GCN-LABEL: {{^}}name: implicit_def_no_break{{$}} +# GCN: %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec +# GCN-NEXT: %3:vreg_64 = IMPLICIT_DEF +# GCN-NEXT: %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %3, 32, 0, 0, 0, implicit $exec +# GCN-NEXT: %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %3, 48, 0, 0, 0, implicit $exec +# GCN-NEXT: KILL %3{{$}} +# GCN-NEXT: KILL %0{{$}} +--- +name: implicit_def_no_break +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1 + %0:vreg_64 = COPY $vgpr0_vgpr1 + %1:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 0, 0, 0, 0, implicit $exec + %2:vreg_128 = GLOBAL_LOAD_DWORDX4 %0, 16, 0, 0, 0, implicit $exec + %3:vreg_64 = IMPLICIT_DEF + %4:vreg_128 = GLOBAL_LOAD_DWORDX4 %3, 32, 0, 0, 0, implicit $exec + %5:vreg_128 = GLOBAL_LOAD_DWORDX4 %3, 48, 0, 0, 0, implicit $exec + S_NOP 0, implicit %1, implicit %2, implicit %4, implicit %5 +... + +# GCN-LABEL: {{^}}name: kill_part_subreg{{$}} +# GCN: undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, implicit $exec +# GCN-NEXT: %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, implicit $exec +# GCN-NEXT: %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, implicit $exec +# GCN-NEXT: KILL %0.sub0_sub1_sub2, %0.sub3 +--- +name: kill_part_subreg +tracksRegLiveness: true +body: | + bb.0: + liveins: $vgpr0_vgpr1_vgpr2_vgpr3 + %0:vreg_128 = COPY $vgpr0_vgpr1_vgpr2_vgpr3 + undef %1.sub0:vreg_128 = GLOBAL_LOAD_DWORD %0.sub0_sub1, 0, 0, 0, 0, implicit $exec + %1.sub1:vreg_128 = GLOBAL_LOAD_DWORD %0.sub1_sub2, 16, 0, 0, 0, implicit $exec + %1.sub3:vreg_128 = GLOBAL_LOAD_DWORD %0.sub2_sub3, 32, 0, 0, 0, implicit $exec + S_NOP 0, implicit %1 +... 
Index: llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir +++ llvm/test/CodeGen/AMDGPU/postra-bundle-memops.mir @@ -278,3 +278,40 @@ $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec KILL killed $vgpr7, killed $vgpr3 ... + +--- +name: post_bundle_multi_kill_0 +body: | + bb.0: + liveins: $vgpr3_vgpr4, $vgpr5_vgpr6 + + ; GCN-LABEL: name: post_bundle_multi_kill_0 + ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 { + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + ; GCN: } + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + KILL killed $vgpr3_vgpr4 + KILL killed $vgpr5_vgpr6 +... + + +--- +name: post_bundle_multi_kill_1 +body: | + bb.0: + liveins: $vgpr3_vgpr4, $vgpr5_vgpr6 + + ; GCN-LABEL: name: post_bundle_multi_kill_1 + ; GCN: BUNDLE implicit-def $vgpr0, implicit-def $vgpr0_lo16, implicit-def $vgpr0_hi16, implicit-def $vgpr1, implicit-def $vgpr1_lo16, implicit-def $vgpr1_hi16, implicit $vgpr3_vgpr4, implicit $exec, implicit $vgpr5_vgpr6 { + ; GCN: $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + ; GCN: $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + ; GCN: } + $vgpr0 = GLOBAL_LOAD_DWORD $vgpr3_vgpr4, 0, 0, 0, 0, implicit $exec + $vgpr1 = GLOBAL_LOAD_DWORD $vgpr5_vgpr6, 0, 0, 0, 0, implicit $exec + KILL killed $vgpr3 + KILL $vgpr4 + KILL killed $vgpr5 + KILL killed $vgpr6 +... 
Index: llvm/test/CodeGen/AMDGPU/soft-clause-dbg-value.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/soft-clause-dbg-value.mir +++ llvm/test/CodeGen/AMDGPU/soft-clause-dbg-value.mir @@ -14,22 +14,19 @@ ; CHECK-LABEL: name: sgpr_clause_dbg_value ; CHECK: liveins: $sgpr4_sgpr5, $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7, $vgpr8, $vgpr9, $vgpr10, $vgpr11, $vgpr12, $vgpr13, $vgpr14, $vgpr15, $vgpr16, $vgpr17, $vgpr18, $vgpr19, $vgpr20, $vgpr21, $vgpr22, $vgpr23, $vgpr24 ; CHECK: [[COPY:%[0-9]+]]:sreg_64 = COPY $sgpr4_sgpr5 - ; CHECK: early-clobber %2:sreg_32_xm0_xexec, early-clobber %1:sreg_32_xm0_xexec = BUNDLE [[COPY]] { - ; CHECK: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0, 0 :: (load 4, addrspace 4) - ; CHECK: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 8, 0, 0 :: (load 4, addrspace 4) - ; CHECK: } + ; CHECK: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 0, 0, 0 :: (load 4, addrspace 4) ; CHECK: DBG_VALUE [[S_LOAD_DWORD_IMM]], 0, 0 + ; CHECK: [[S_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 8, 0, 0 :: (load 4, addrspace 4) ; CHECK: DBG_VALUE [[S_LOAD_DWORD_IMM1]], 0, 0 ; CHECK: S_NOP 0 ; CHECK: S_NOP 0 ; CHECK: S_NOP 0 - ; CHECK: early-clobber %4:sreg_32_xm0_xexec, early-clobber %3:sreg_32_xm0_xexec, early-clobber %5:sreg_32_xm0_xexec = BUNDLE [[COPY]] { - ; CHECK: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 16, 0, 0 :: (load 4, addrspace 4) - ; CHECK: [[S_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 32, 0, 0 :: (load 4, addrspace 4) - ; CHECK: [[S_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 64, 0, 0 :: (load 4, addrspace 4) - ; CHECK: } + ; CHECK: [[S_LOAD_DWORD_IMM2:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 16, 0, 0 :: (load 4, addrspace 4) + ; CHECK: 
[[S_LOAD_DWORD_IMM3:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 32, 0, 0 :: (load 4, addrspace 4) ; CHECK: DBG_VALUE [[S_LOAD_DWORD_IMM2]], 0, 0 ; CHECK: DBG_VALUE [[S_LOAD_DWORD_IMM3]], 0, 0 + ; CHECK: [[S_LOAD_DWORD_IMM4:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM [[COPY]], 64, 0, 0 :: (load 4, addrspace 4) + ; CHECK: KILL [[COPY]] ; CHECK: S_ENDPGM 0, implicit [[S_LOAD_DWORD_IMM]], implicit [[S_LOAD_DWORD_IMM1]], implicit [[S_LOAD_DWORD_IMM2]], implicit [[S_LOAD_DWORD_IMM3]], implicit [[S_LOAD_DWORD_IMM4]] %0:sreg_64 = COPY $sgpr4_sgpr5 %1:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM %0, 0, 0, 0 :: (load 4, align 4, addrspace 4) Index: llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll +++ llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr.ll @@ -246,7 +246,7 @@ ; GFX908-DAG: v_accvgpr_read_b32 ; GCN: NumVgprs: 256 -; GFX900: ScratchSize: 1156 +; GFX900: ScratchSize: 1796 ; GFX908-FIXME: ScratchSize: 0 ; GCN: VGPRBlocks: 63 ; GCN: NumVGPRsForWavesPerEU: 256 Index: llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll +++ llvm/test/CodeGen/AMDGPU/splitkit-getsubrangeformask.ll @@ -36,20 +36,21 @@ ; CHECK: [[S_ASHR_I32_2:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_2]], 31, implicit-def dead $scc ; CHECK: undef %130.sub0:sreg_64 = S_ADD_U32 [[COPY4]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %130.sub1:sreg_64 = S_ADDC_U32 undef %54:sreg_32, [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc - ; CHECK: early-clobber %73:sgpr_128, early-clobber %143:sgpr_128, early-clobber %131:sreg_32_xm0_xexec = BUNDLE %130, undef %132:sgpr_128, undef %74:sreg_64 { - ; CHECK: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %130, 16, 0, 0 :: (load 16 from %ir.84, addrspace 4) - ; CHECK: 
[[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0, 0 :: (load 16 from `<4 x i32> addrspace(4)* undef`, addrspace 4) - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %132:sgpr_128, 0, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: } + ; CHECK: [[S_LOAD_DWORDX4_IMM:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %130, 16, 0, 0 :: (load 16 from %ir.84, addrspace 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM1:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM undef %74:sreg_64, 0, 0, 0 :: (load 16 from `<4 x i32> addrspace(4)* undef`, addrspace 4) + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM undef %132:sgpr_128, 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: KILL undef %74:sreg_64 + ; CHECK: KILL undef %132:sgpr_128 + ; CHECK: KILL %130.sub0, %130.sub1 ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM1:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM [[S_LOAD_DWORDX4_IMM]], 0, 0, 0 :: (dereferenceable invariant load 4) ; CHECK: [[V_MOV_B32_e32_:%[0-9]+]]:vgpr_32 = V_MOV_B32_e32 0, implicit $exec ; CHECK: %71.sub3:sgpr_128 = S_MOV_B32 553734060 ; CHECK: %71.sub2:sgpr_128 = S_MOV_B32 -1 - ; CHECK: early-clobber %87:vgpr_32, early-clobber %117:vgpr_32, early-clobber %76:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM1]], undef %118:sgpr_128, undef %89:sgpr_128, [[V_MOV_B32_e32_]], implicit $exec { - ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], 
[[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: } + ; CHECK: [[BUFFER_LOAD_DWORD_OFFSET:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_DWORD_OFFSET undef %118:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], undef %89:sgpr_128, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN1:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM1]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: KILL undef %89:sgpr_128 + ; CHECK: KILL undef %118:sgpr_128 ; CHECK: SI_SPILL_S128_SAVE %71, %stack.1, implicit $exec, implicit $sgpr32 :: (store 16 into %stack.1, align 4, addrspace 5) ; CHECK: %71.sub1:sgpr_128 = S_MOV_B32 0 ; CHECK: [[S_SUB_I32_:%[0-9]+]]:sreg_32 = S_SUB_I32 [[S_BUFFER_LOAD_DWORD_IMM]], 29, implicit-def dead $scc @@ -64,6 +65,8 @@ ; CHECK: undef %163.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: [[S_LOAD_DWORDX4_IMM2:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %149, 0, 0, 0 :: (load 16 from %ir.91, addrspace 4) ; CHECK: [[S_LOAD_DWORDX4_IMM3:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %156, 0, 0, 0 :: (load 16 from %ir.97, addrspace 4) + ; CHECK: KILL %156.sub0, %156.sub1 + ; CHECK: KILL %149.sub0, %149.sub1 ; CHECK: %163.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: [[S_ASHR_I32_3:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 undef %171:sreg_32, 31, implicit-def dead $scc ; CHECK: undef %176.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_]], undef %171:sreg_32, implicit-def $scc @@ -201,11 +204,12 @@ ; CHECK: 
%468.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_7]], implicit-def dead $scc, implicit $scc ; CHECK: %71.sub0_sub1:sgpr_128 = S_LOAD_DWORDX2_IMM %468, 0, 0, 0 :: (load 8 from %ir.316, addrspace 4) ; CHECK: %71.sub1:sgpr_128 = S_AND_B32 %71.sub1, [[S_MOV_B32_]], implicit-def dead $scc - ; CHECK: early-clobber %412:sgpr_128, early-clobber %487:sreg_32_xm0_xexec, early-clobber %475:sreg_32_xm0_xexec = BUNDLE %71, undef %488:sreg_64, %411 { - ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) - ; CHECK: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %411, 0, 0, 0 :: (load 16 from %ir.278, addrspace 4) - ; CHECK: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %488:sreg_64, 0, 0, 0 :: (load 4 from `i32 addrspace(4)* undef`, addrspace 4) - ; CHECK: } + ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM6:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) + ; CHECK: [[S_LOAD_DWORDX4_IMM23:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %411, 0, 0, 0 :: (load 16 from %ir.278, addrspace 4) + ; CHECK: [[S_LOAD_DWORD_IMM:%[0-9]+]]:sreg_32_xm0_xexec = S_LOAD_DWORD_IMM undef %488:sreg_64, 0, 0, 0 :: (load 4 from `i32 addrspace(4)* undef`, addrspace 4) + ; CHECK: KILL %411.sub0, %411.sub1 + ; CHECK: KILL undef %488:sreg_64 + ; CHECK: KILL %71.sub0_sub1 ; CHECK: [[S_LSHL_B32_8:%[0-9]+]]:sreg_32 = S_LSHL_B32 [[COPY11]], 3, implicit-def dead $scc ; CHECK: [[S_LOAD_DWORDX4_IMM24:%[0-9]+]]:sgpr_128 = S_LOAD_DWORDX4_IMM %425, 0, 0, 0 :: (load 16 from %ir.287, addrspace 4) ; CHECK: [[S_ASHR_I32_8:%[0-9]+]]:sreg_32_xm0 = S_ASHR_I32 [[S_LSHL_B32_8]], 31, implicit-def dead $scc @@ -213,10 +217,10 @@ ; CHECK: undef %485.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_4]], [[S_LSHL_B32_8]], implicit-def $scc ; CHECK: %485.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_4]], [[S_ASHR_I32_8]], implicit-def dead $scc, implicit $scc ; CHECK: 
%71.sub0:sgpr_128 = S_LOAD_DWORD_IMM %485, 0, 0, 0 :: (load 4 from %ir..i0100.i, align 8, addrspace 4) - ; CHECK: early-clobber %413:vgpr_32, early-clobber %427:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM23]], [[S_LOAD_DWORDX4_IMM24]], [[V_MOV_B32_e32_]], implicit $exec { - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: } + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN21:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM23]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN22:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM24]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: KILL [[S_LOAD_DWORDX4_IMM24]] + ; CHECK: KILL [[S_LOAD_DWORDX4_IMM23]] ; CHECK: %71.sub1:sgpr_128 = S_AND_B32 [[S_LOAD_DWORD_IMM]], [[S_MOV_B32_]], implicit-def dead $scc ; CHECK: [[S_BUFFER_LOAD_DWORD_IMM7:%[0-9]+]]:sreg_32_xm0_xexec = S_BUFFER_LOAD_DWORD_IMM %71, 0, 0, 0 :: (dereferenceable invariant load 4) ; CHECK: [[S_ADD_I32_18:%[0-9]+]]:sreg_32 = S_ADD_I32 [[S_BUFFER_LOAD_DWORD_IMM]], -474, implicit-def dead $scc @@ -237,11 +241,13 @@ ; CHECK: undef %530.sub0:sreg_64 = S_ADD_U32 [[S_ADD_U32_5]], [[S_LSHL_B32_2]], implicit-def $scc ; CHECK: %530.sub1:sreg_64 = S_ADDC_U32 [[S_ADDC_U32_5]], [[S_ASHR_I32_2]], implicit-def dead $scc, implicit $scc ; CHECK: [[S_LOAD_DWORDX4_IMM27:%[0-9]+]]:sgpr_128 
= S_LOAD_DWORDX4_IMM %530, 0, 0, 0 :: (load 16 from %ir.359, addrspace 4) - ; CHECK: early-clobber %516:vgpr_32, early-clobber %532:vgpr_32, early-clobber %524:vgpr_32 = BUNDLE [[S_LOAD_DWORDX4_IMM26]], [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], [[S_LOAD_DWORDX4_IMM27]], implicit $exec { - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: } + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN23:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM25]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN24:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM26]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: [[BUFFER_LOAD_FORMAT_X_IDXEN25:%[0-9]+]]:vgpr_32 = BUFFER_LOAD_FORMAT_X_IDXEN [[V_MOV_B32_e32_]], [[S_LOAD_DWORDX4_IMM27]], 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 4 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: KILL [[S_LOAD_DWORDX4_IMM27]] + ; CHECK: KILL [[S_LOAD_DWORDX4_IMM25]] + ; CHECK: KILL [[V_MOV_B32_e32_]] + ; CHECK: KILL [[S_LOAD_DWORDX4_IMM26]] ; CHECK: 
[[V_ADD_U32_e32_:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -2, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec ; CHECK: [[V_ADD_U32_e32_1:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -1, [[BUFFER_LOAD_FORMAT_X_IDXEN1]], implicit $exec ; CHECK: [[V_ADD_U32_e32_2:%[0-9]+]]:vgpr_32 = V_ADD_U32_e32 -3, [[BUFFER_LOAD_FORMAT_X_IDXEN]], implicit $exec Index: llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir =================================================================== --- llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir +++ llvm/test/CodeGen/AMDGPU/unallocatable-bundle-regression.mir @@ -41,24 +41,23 @@ ; CHECK: renamable $sgpr14 = COPY renamable $sgpr5 ; CHECK: renamable $sgpr15 = COPY renamable $sgpr5 ; CHECK: renamable $vgpr5_vgpr6 = COPY renamable $sgpr0_sgpr1 - ; CHECK: early-clobber renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23, early-clobber renamable $sgpr80_sgpr81_sgpr82_sgpr83 = BUNDLE renamable $sgpr4_sgpr5 { - ; CHECK: renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1088, 0, 0 :: (dereferenceable load 32, addrspace 6) - ; CHECK: renamable $sgpr80_sgpr81_sgpr82_sgpr83 = S_LOAD_DWORDX4_IMM renamable $sgpr4_sgpr5, 0, 0, 0 :: (load 16, addrspace 6) - ; CHECK: } + ; CHECK: renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1088, 0, 0 :: (dereferenceable load 32, addrspace 6) + ; CHECK: renamable $sgpr80_sgpr81_sgpr82_sgpr83 = S_LOAD_DWORDX4_IMM renamable $sgpr4_sgpr5, 0, 0, 0 :: (load 16, addrspace 6) ; CHECK: renamable $sgpr0 = S_MOV_B32 1200 ; CHECK: renamable $sgpr1 = COPY renamable $sgpr5 - ; CHECK: early-clobber renamable $sgpr84_sgpr85_sgpr86_sgpr87, early-clobber renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = BUNDLE renamable $sgpr0_sgpr1, renamable $sgpr4_sgpr5 { - ; CHECK: renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 
1152, 0, 0 :: (dereferenceable load 32, addrspace 6) - ; CHECK: renamable $sgpr84_sgpr85_sgpr86_sgpr87 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 0, 0, 0 :: (load 16, addrspace 6) - ; CHECK: } + ; CHECK: renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1152, 0, 0 :: (dereferenceable load 32, addrspace 6) + ; CHECK: renamable $sgpr84_sgpr85_sgpr86_sgpr87 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 0, 0, 0 :: (load 16, addrspace 6) + ; CHECK: KILL renamable $sgpr0, renamable $sgpr1 ; CHECK: renamable $sgpr0 = S_MOV_B32 1264 ; CHECK: renamable $sgpr1 = COPY renamable $sgpr5 ; CHECK: renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1216, 0, 0 :: (dereferenceable load 32, addrspace 6) ; CHECK: renamable $sgpr88_sgpr89_sgpr90_sgpr91 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 0, 0, 0 :: (load 16, addrspace 6) + ; CHECK: KILL renamable $sgpr0, renamable $sgpr1 ; CHECK: renamable $sgpr0 = S_MOV_B32 1328 ; CHECK: renamable $sgpr1 = COPY renamable $sgpr5 ; CHECK: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1280, 0, 0 :: (dereferenceable load 32, addrspace 6) ; CHECK: renamable $sgpr92_sgpr93_sgpr94_sgpr95 = S_LOAD_DWORDX4_IMM renamable $sgpr0_sgpr1, 0, 0, 0 :: (load 16, addrspace 6) + ; CHECK: KILL renamable $sgpr0, renamable $sgpr1 ; CHECK: renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59 = S_LOAD_DWORDX8_IMM renamable $sgpr4_sgpr5, 1344, 0, 0 :: (dereferenceable load 32, addrspace 6) ; CHECK: renamable $sgpr0 = S_MOV_B32 1392 ; CHECK: renamable $sgpr1 = COPY renamable $sgpr5 @@ -73,15 +72,26 @@ ; CHECK: renamable $vgpr7 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable $sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, renamable $sgpr76_sgpr77_sgpr78_sgpr79, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from 
custom "ImageResource") ; CHECK: renamable $vgpr8 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr16_sgpr17_sgpr18_sgpr19_sgpr20_sgpr21_sgpr22_sgpr23, killed renamable $sgpr80_sgpr81_sgpr82_sgpr83, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "ImageResource") ; CHECK: renamable $vgpr9 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr24_sgpr25_sgpr26_sgpr27_sgpr28_sgpr29_sgpr30_sgpr31, killed renamable $sgpr84_sgpr85_sgpr86_sgpr87, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "ImageResource") - ; CHECK: renamable $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, killed renamable $sgpr88_sgpr89_sgpr90_sgpr91, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "ImageResource") - ; CHECK: renamable $vgpr11 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, killed renamable $sgpr92_sgpr93_sgpr94_sgpr95, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "ImageResource") - ; CHECK: renamable $vgpr12 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, killed renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, killed renamable $sgpr96_sgpr97_sgpr98_sgpr99, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "ImageResource") + ; CHECK: renamable $vgpr10 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43, renamable $sgpr88_sgpr89_sgpr90_sgpr91, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "ImageResource") + ; CHECK: renamable $vgpr11 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable 
$sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51, renamable $sgpr92_sgpr93_sgpr94_sgpr95, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "ImageResource") + ; CHECK: renamable $vgpr12 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67, renamable $sgpr96_sgpr97_sgpr98_sgpr99, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "ImageResource") + ; CHECK: renamable $vgpr13 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "ImageResource") + ; CHECK: renamable $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "ImageResource") ; CHECK: renamable $sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S128_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load 16 from %stack.0, align 4, addrspace 5) - ; CHECK: early-clobber renamable $vgpr14, early-clobber renamable $vgpr13, early-clobber renamable $vgpr1_vgpr2_vgpr3_vgpr4 = BUNDLE killed renamable $vgpr0, killed renamable $sgpr8_sgpr9_sgpr10_sgpr11, killed renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59, killed renamable $sgpr0_sgpr1_sgpr2_sgpr3, killed renamable $vgpr5_vgpr6, killed renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, killed renamable $sgpr4_sgpr5_sgpr6_sgpr7, implicit $exec { - ; CHECK: renamable $vgpr13 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75, renamable $sgpr4_sgpr5_sgpr6_sgpr7, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 
from custom "ImageResource") - ; CHECK: renamable $vgpr14 = IMAGE_SAMPLE_LZ_V1_V2_gfx10 renamable $vgpr5_vgpr6, renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59, renamable $sgpr0_sgpr1_sgpr2_sgpr3, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "ImageResource") - ; CHECK: renamable $vgpr1_vgpr2_vgpr3_vgpr4 = BUFFER_LOAD_FORMAT_XYZW_IDXEN renamable $vgpr0, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) - ; CHECK: } + ; CHECK: renamable $vgpr1_vgpr2_vgpr3_vgpr4 = BUFFER_LOAD_FORMAT_XYZW_IDXEN renamable $vgpr0, renamable $sgpr8_sgpr9_sgpr10_sgpr11, 0, 0, 0, 0, 0, 0, 0, implicit $exec :: (dereferenceable load 16 from custom "BufferResource", align 1, addrspace 4) + ; CHECK: KILL killed renamable $sgpr4_sgpr5_sgpr6_sgpr7 + ; CHECK: KILL killed renamable $sgpr92_sgpr93_sgpr94_sgpr95 + ; CHECK: KILL killed renamable $sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75 + ; CHECK: KILL killed renamable $sgpr60_sgpr61_sgpr62_sgpr63_sgpr64_sgpr65_sgpr66_sgpr67 + ; CHECK: KILL killed renamable $sgpr36_sgpr37_sgpr38_sgpr39_sgpr40_sgpr41_sgpr42_sgpr43 + ; CHECK: KILL killed renamable $vgpr5_vgpr6 + ; CHECK: KILL killed renamable $sgpr0_sgpr1_sgpr2_sgpr3 + ; CHECK: KILL killed renamable $sgpr52_sgpr53_sgpr54_sgpr55_sgpr56_sgpr57_sgpr58_sgpr59 + ; CHECK: KILL killed renamable $sgpr96_sgpr97_sgpr98_sgpr99 + ; CHECK: KILL killed renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 + ; CHECK: KILL killed renamable $sgpr8_sgpr9_sgpr10_sgpr11 + ; CHECK: KILL killed renamable $sgpr88_sgpr89_sgpr90_sgpr91 + ; CHECK: KILL killed renamable $vgpr0 ; CHECK: renamable $vgpr0 = nofpexcept V_MAX_F32_e32 killed $vgpr7, killed $vgpr8, implicit $mode, implicit $exec ; CHECK: renamable $vgpr0 = V_MAX3_F32_e64 0, killed $vgpr0, 0, killed $vgpr9, 0, killed $vgpr10, 0, 0, implicit $mode, implicit $exec ; CHECK: renamable 
$vgpr1 = nofpexcept V_ADD_F32_e32 -1083321614, killed $vgpr12, implicit $mode, implicit $exec Index: llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll +++ llvm/test/CodeGen/AMDGPU/vector_shuffle.packed.ll @@ -19,11 +19,13 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[2:3], off -; GFX9-NEXT: global_load_dword v6, v[0:1], off offset:4 -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dword v5, v[0:1], off offset:4 +; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 +; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GFX9-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_mov_b32_e32 v0, v6 +; GFX9-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 @@ -811,14 +813,16 @@ ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-NEXT: global_load_dwordx2 v[4:5], v[0:1], off -; GFX9-NEXT: global_load_dwordx2 v[6:7], v[2:3], off +; GFX9-NEXT: s_waitcnt vmcnt(0) +; GFX9-NEXT: global_load_dwordx2 v[5:6], v[2:3], off +; GFX9-NEXT: ; kill: killed $vgpr0 killed $vgpr1 ; GFX9-NEXT: v_mov_b32_e32 v0, 0xffff -; GFX9-NEXT: s_waitcnt vmcnt(1) +; GFX9-NEXT: ; kill: killed $vgpr2 killed $vgpr3 ; GFX9-NEXT: v_and_b32_e32 v1, v0, v4 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_and_b32_sdwa v2, v0, v6 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 -; GFX9-NEXT: v_lshl_or_b32 v0, v6, 16, v1 -; GFX9-NEXT: v_lshl_or_b32 v1, v7, 16, v2 +; GFX9-NEXT: v_and_b32_sdwa v2, v0, v5 dst_sel:DWORD dst_unused:UNUSED_PAD src0_sel:DWORD src1_sel:WORD_1 +; GFX9-NEXT: v_lshl_or_b32 v0, v5, 16, v1 +; GFX9-NEXT: v_lshl_or_b32 v1, v6, 16, v2 ; GFX9-NEXT: s_setpc_b64 s[30:31] %val0 = load <4 x 
half>, <4 x half> addrspace(1)* %arg0 %val1 = load <4 x half>, <4 x half> addrspace(1)* %arg1 Index: llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll =================================================================== --- llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll +++ llvm/test/CodeGen/AMDGPU/vgpr-descriptor-waterfall-loop-idom-update.ll @@ -13,17 +13,17 @@ ; GCN-NEXT: s_mov_b32 s5, exec_lo ; GCN-NEXT: v_add_co_ci_u32_e32 v7, vcc_lo, 0, v1, vcc_lo ; GCN-NEXT: s_clause 0x1 -; GCN-NEXT: flat_load_dwordx2 v[4:5], v[6:7] -; GCN-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GCN-NEXT: flat_load_dwordx2 v[2:3], v[6:7] +; GCN-NEXT: flat_load_dwordx2 v[4:5], v[0:1] ; GCN-NEXT: BB0_2: ; Parent Loop BB0_1 Depth=1 ; GCN-NEXT: ; => This Inner Loop Header: Depth=2 ; GCN-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-NEXT: v_readfirstlane_b32 s8, v2 -; GCN-NEXT: v_readfirstlane_b32 s9, v3 -; GCN-NEXT: v_readfirstlane_b32 s10, v4 -; GCN-NEXT: v_readfirstlane_b32 s11, v5 -; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[2:3] -; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[4:5] +; GCN-NEXT: v_readfirstlane_b32 s8, v4 +; GCN-NEXT: v_readfirstlane_b32 s9, v5 +; GCN-NEXT: v_readfirstlane_b32 s10, v2 +; GCN-NEXT: v_readfirstlane_b32 s11, v3 +; GCN-NEXT: v_cmp_eq_u64_e32 vcc_lo, s[8:9], v[4:5] +; GCN-NEXT: v_cmp_eq_u64_e64 s4, s[10:11], v[2:3] ; GCN-NEXT: s_and_b32 s4, vcc_lo, s4 ; GCN-NEXT: s_and_saveexec_b32 s4, s4 ; GCN-NEXT: s_nop 0