diff --git a/llvm/include/llvm/CodeGen/TargetInstrInfo.h b/llvm/include/llvm/CodeGen/TargetInstrInfo.h --- a/llvm/include/llvm/CodeGen/TargetInstrInfo.h +++ b/llvm/include/llvm/CodeGen/TargetInstrInfo.h @@ -1044,16 +1044,6 @@ return isCopyInstrImpl(MI); } - bool isFullCopyInstr(const MachineInstr &MI) const { - auto DestSrc = isCopyInstr(MI); - if (!DestSrc) - return false; - - const MachineOperand *DestRegOp = DestSrc->Destination; - const MachineOperand *SrcRegOp = DestSrc->Source; - return !DestRegOp->getSubReg() && !SrcRegOp->getSubReg(); - } - /// If the specific machine instruction is an instruction that adds an /// immediate value and a physical register, and stores the result in /// the given physical register \c Reg, return a pair of the source @@ -1968,13 +1958,6 @@ return false; } - /// Allows targets to use appropriate copy instruction while spilitting live - /// range of a register in register allocation. - virtual unsigned getLiveRangeSplitOpcode(Register Reg, - const MachineFunction &MF) const { - return TargetOpcode::COPY; - } - /// During PHI eleimination lets target to make necessary checks and /// insert the copy to the PHI destination register in a target specific /// manner. diff --git a/llvm/lib/CodeGen/CalcSpillWeights.cpp b/llvm/lib/CodeGen/CalcSpillWeights.cpp --- a/llvm/lib/CodeGen/CalcSpillWeights.cpp +++ b/llvm/lib/CodeGen/CalcSpillWeights.cpp @@ -97,7 +97,7 @@ // Trace copies introduced by live range splitting. The inline // spiller can rematerialize through these copies, so the spill // weight must reflect this. - while (TII.isFullCopyInstr(*MI)) { + while (MI->isFullCopy()) { // The copy destination must match the interval register. if (MI->getOperand(0).getReg() != Reg) return false; @@ -224,16 +224,7 @@ continue; NumInstr++; - bool identityCopy = false; - auto DestSrc = TII.isCopyInstr(*MI); - if (DestSrc) { - const MachineOperand *DestRegOp = DestSrc->Destination; - const MachineOperand *SrcRegOp = DestSrc->Source; - identityCopy = DestRegOp->getReg() == SrcRegOp->getReg() && - DestRegOp->getSubReg() == SrcRegOp->getSubReg(); - } - - if (identityCopy || MI->isImplicitDef()) + if (MI->isIdentityCopy() || MI->isImplicitDef()) continue; if (!Visited.insert(MI).second) continue; @@ -267,7 +258,7 @@ } // Get allocation hints from copies. - if (!TII.isCopyInstr(*MI)) + if (!MI->isCopy()) continue; Register HintReg = copyHint(MI, LI.reg(), TRI, MRI); if (!HintReg) diff --git a/llvm/lib/CodeGen/InlineSpiller.cpp b/llvm/lib/CodeGen/InlineSpiller.cpp --- a/llvm/lib/CodeGen/InlineSpiller.cpp +++ b/llvm/lib/CodeGen/InlineSpiller.cpp @@ -256,11 +256,11 @@ // This minimizes register pressure and maximizes the store-to-load distance for // spill slots which can be important in tight loops. -/// isFullCopyOf - If MI is a COPY to or from Reg, return the other register, -/// otherwise return 0. -static Register isCopyOf(const MachineInstr &MI, Register Reg, - const TargetInstrInfo &TII) { - if (!TII.isCopyInstr(MI)) +/// If MI is a COPY to or from Reg, return the other register, otherwise return +/// 0. +static Register isCopyOf(const MachineInstr &MI, Register Reg) { + assert(!MI.isBundled()); + if (!MI.isCopy()) return Register(); const MachineOperand &DstOp = MI.getOperand(0); @@ -277,10 +277,9 @@ } /// Check for a copy bundle as formed by SplitKit. -static Register isCopyOfBundle(const MachineInstr &FirstMI, Register Reg, - const TargetInstrInfo &TII) { +static Register isCopyOfBundle(const MachineInstr &FirstMI, Register Reg) { if (!FirstMI.isBundled()) - return isCopyOf(FirstMI, Reg, TII); + return isCopyOf(FirstMI, Reg); assert(!FirstMI.isBundledWithPred() && FirstMI.isBundledWithSucc() && "expected to see first instruction in bundle"); @@ -289,12 +288,11 @@ MachineBasicBlock::const_instr_iterator I = FirstMI.getIterator(); while (I->isBundledWithSucc()) { const MachineInstr &MI = *I; - auto CopyInst = TII.isCopyInstr(MI); - if (!CopyInst) + if (!MI.isCopy()) return Register(); - const MachineOperand &DstOp = *CopyInst->Destination; - const MachineOperand &SrcOp = *CopyInst->Source; + const MachineOperand &DstOp = MI.getOperand(0); + const MachineOperand &SrcOp = MI.getOperand(1); if (DstOp.getReg() == Reg) { if (!SnipReg) SnipReg = SrcOp.getReg(); @@ -360,7 +358,7 @@ MachineInstr &MI = *RI++; // Allow copies to/from Reg. - if (isCopyOfBundle(MI, Reg, TII)) + if (isCopyOfBundle(MI, Reg)) continue; // Allow stack slot loads. @@ -398,7 +396,7 @@ return; for (MachineInstr &MI : llvm::make_early_inc_range(MRI.reg_bundles(Reg))) { - Register SnipReg = isCopyOfBundle(MI, Reg, TII); + Register SnipReg = isCopyOfBundle(MI, Reg); if (!isSibling(SnipReg)) continue; LiveInterval &SnipLI = LIS.getInterval(SnipReg); @@ -521,14 +519,14 @@ // Find all spills and copies of VNI. for (MachineInstr &MI : llvm::make_early_inc_range(MRI.use_nodbg_bundles(Reg))) { - if (!MI.mayStore() && !TII.isCopyInstr(MI)) + if (!MI.isCopy() && !MI.mayStore()) continue; SlotIndex Idx = LIS.getInstructionIndex(MI); if (LI->getVNInfoAt(Idx) != VNI) continue; // Follow sibling copies down the dominator tree. - if (Register DstReg = isCopyOfBundle(MI, Reg, TII)) { + if (Register DstReg = isCopyOfBundle(MI, Reg)) { if (isSibling(DstReg)) { LiveInterval &DstLI = LIS.getInterval(DstReg); VNInfo *DstVNI = DstLI.getVNInfoAt(Idx.getRegSlot()); @@ -872,7 +870,7 @@ if (Ops.back().first != MI || MI->isBundled()) return false; - bool WasCopy = TII.isCopyInstr(*MI).has_value(); + bool WasCopy = MI->isCopy(); Register ImpReg; // TII::foldMemoryOperand will do what we need here for statepoint @@ -1157,7 +1155,7 @@ Idx = VNI->def; // Check for a sibling copy. - Register SibReg = isCopyOfBundle(MI, Reg, TII); + Register SibReg = isCopyOfBundle(MI, Reg); if (SibReg && isSibling(SibReg)) { // This may actually be a copy between snippets. if (isRegToSpill(SibReg)) { diff --git a/llvm/lib/CodeGen/LiveRangeEdit.cpp b/llvm/lib/CodeGen/LiveRangeEdit.cpp --- a/llvm/lib/CodeGen/LiveRangeEdit.cpp +++ b/llvm/lib/CodeGen/LiveRangeEdit.cpp @@ -352,8 +352,7 @@ // unlikely to change anything. We typically don't want to shrink the // PIC base register that has lots of uses everywhere. // Always shrink COPY uses that probably come from live range splitting. - if ((MI->readsVirtualRegister(Reg) && - (MO.isDef() || TII.isCopyInstr(*MI))) || + if ((MI->readsVirtualRegister(Reg) && (MI->isCopy() || MO.isDef())) || (MO.readsReg() && (MRI.hasOneNonDBGUse(Reg) || useIsKill(LI, MO)))) ToShrink.insert(&LI); else if (MO.readsReg()) diff --git a/llvm/lib/CodeGen/LiveRangeShrink.cpp b/llvm/lib/CodeGen/LiveRangeShrink.cpp --- a/llvm/lib/CodeGen/LiveRangeShrink.cpp +++ b/llvm/lib/CodeGen/LiveRangeShrink.cpp @@ -23,7 +23,6 @@ #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/TargetInstrInfo.h" #include "llvm/InitializePasses.h" #include "llvm/Pass.h" #include "llvm/Support/Debug.h" @@ -110,7 +109,6 @@ return false; MachineRegisterInfo &MRI = MF.getRegInfo(); - const TargetInstrInfo &TII = *MF.getSubtarget().getInstrInfo(); LLVM_DEBUG(dbgs() << "**** Analysing " << MF.getName() << '\n'); @@ -199,7 +197,7 @@ // is because it needs more accurate model to handle register // pressure correctly. MachineInstr &DefInstr = *MRI.def_instr_begin(Reg); - if (!TII.isCopyInstr(DefInstr)) + if (!DefInstr.isCopy()) NumEligibleUse++; Insert = FindDominatedInstruction(DefInstr, Insert, IOM); } else { diff --git a/llvm/lib/CodeGen/RegAllocGreedy.cpp b/llvm/lib/CodeGen/RegAllocGreedy.cpp --- a/llvm/lib/CodeGen/RegAllocGreedy.cpp +++ b/llvm/lib/CodeGen/RegAllocGreedy.cpp @@ -1282,12 +1282,10 @@ /// VirtReg. static bool readsLaneSubset(const MachineRegisterInfo &MRI, const MachineInstr *MI, const LiveInterval &VirtReg, - const TargetRegisterInfo *TRI, SlotIndex Use, - const TargetInstrInfo *TII) { + const TargetRegisterInfo *TRI, SlotIndex Use) { // Early check the common case. - auto DestSrc = TII->isCopyInstr(*MI); - if (DestSrc && - DestSrc->Destination->getSubReg() == DestSrc->Source->getSubReg()) + if (MI->isCopy() && + MI->getOperand(0).getSubReg() == MI->getOperand(1).getSubReg()) return false; // FIXME: We're only considering uses, but should be consider defs too? @@ -1346,14 +1344,14 @@ // the allocation. for (const SlotIndex Use : Uses) { if (const MachineInstr *MI = Indexes->getInstructionFromIndex(Use)) { - if (TII->isFullCopyInstr(*MI) || + if (MI->isFullCopy() || (SplitSubClass && SuperRCNumAllocatableRegs == getNumAllocatableRegsForConstraints(MI, VirtReg.reg(), SuperRC, TII, TRI, RegClassInfo)) || // TODO: Handle split for subranges with subclass constraints? (!SplitSubClass && VirtReg.hasSubRanges() && - !readsLaneSubset(*MRI, MI, VirtReg, TRI, Use, TII))) { + !readsLaneSubset(*MRI, MI, VirtReg, TRI, Use))) { LLVM_DEBUG(dbgs() << " skip:\t" << Use << '\t' << *MI); continue; } @@ -2140,7 +2138,7 @@ /// \p Out is not cleared before being populated. void RAGreedy::collectHintInfo(Register Reg, HintsInfo &Out) { for (const MachineInstr &Instr : MRI->reg_nodbg_instructions(Reg)) { - if (!TII->isFullCopyInstr(Instr)) + if (!Instr.isFullCopy()) continue; // Look for the other end of the copy. Register OtherReg = Instr.getOperand(0).getReg(); @@ -2455,10 +2453,9 @@ MI.getOpcode() == TargetOpcode::STATEPOINT; }; for (MachineInstr &MI : MBB) { - auto DestSrc = TII->isCopyInstr(MI); - if (DestSrc) { - const MachineOperand &Dest = *DestSrc->Destination; - const MachineOperand &Src = *DestSrc->Source; + if (MI.isCopy()) { + const MachineOperand &Dest = MI.getOperand(0); + const MachineOperand &Src = MI.getOperand(1); Register SrcReg = Src.getReg(); Register DestReg = Dest.getReg(); // Only count `COPY`s with a virtual register as source or destination. diff --git a/llvm/lib/CodeGen/SplitKit.h b/llvm/lib/CodeGen/SplitKit.h --- a/llvm/lib/CodeGen/SplitKit.h +++ b/llvm/lib/CodeGen/SplitKit.h @@ -428,11 +428,8 @@ bool Late, unsigned RegIdx); SlotIndex buildSingleSubRegCopy(Register FromReg, Register ToReg, - MachineBasicBlock &MB, - MachineBasicBlock::iterator InsertBefore, - unsigned SubIdx, LiveInterval &DestLI, - bool Late, SlotIndex Def, - const MCInstrDesc &Desc); + MachineBasicBlock &MB, MachineBasicBlock::iterator InsertBefore, + unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def); public: /// Create a new SplitEditor for editing the LiveInterval analyzed by SA. diff --git a/llvm/lib/CodeGen/SplitKit.cpp b/llvm/lib/CodeGen/SplitKit.cpp --- a/llvm/lib/CodeGen/SplitKit.cpp +++ b/llvm/lib/CodeGen/SplitKit.cpp @@ -514,10 +514,10 @@ VFP = ValueForcePair(nullptr, true); } -SlotIndex SplitEditor::buildSingleSubRegCopy( - Register FromReg, Register ToReg, MachineBasicBlock &MBB, - MachineBasicBlock::iterator InsertBefore, unsigned SubIdx, - LiveInterval &DestLI, bool Late, SlotIndex Def, const MCInstrDesc &Desc) { +SlotIndex SplitEditor::buildSingleSubRegCopy(Register FromReg, Register ToReg, + MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, + unsigned SubIdx, LiveInterval &DestLI, bool Late, SlotIndex Def) { + const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY); bool FirstCopy = !Def.isValid(); MachineInstr *CopyMI = BuildMI(MBB, InsertBefore, DebugLoc(), Desc) .addReg(ToReg, RegState::Define | getUndefRegState(FirstCopy) @@ -536,8 +536,7 @@ SlotIndex SplitEditor::buildCopy(Register FromReg, Register ToReg, LaneBitmask LaneMask, MachineBasicBlock &MBB, MachineBasicBlock::iterator InsertBefore, bool Late, unsigned RegIdx) { - const MCInstrDesc &Desc = - TII.get(TII.getLiveRangeSplitOpcode(FromReg, *MBB.getParent())); + const MCInstrDesc &Desc = TII.get(TargetOpcode::COPY); SlotIndexes &Indexes = *LIS.getSlotIndexes(); if (LaneMask.all() || LaneMask == MRI.getMaxLaneMaskForVReg(FromReg)) { // The full vreg is copied. @@ -565,7 +564,7 @@ SlotIndex Def; for (unsigned BestIdx : SubIndexes) { Def = buildSingleSubRegCopy(FromReg, ToReg, MBB, InsertBefore, BestIdx, - DestLI, Late, Def, Desc); + DestLI, Late, Def); } BumpPtrAllocator &Allocator = LIS.getVNInfoAllocator(); @@ -1585,9 +1584,7 @@ if (BI.LiveIn && BI.LiveOut) return true; // No point in isolating a copy. It has no register class constraints. - MachineInstr *MI = LIS.getInstructionFromIndex(BI.FirstInstr); - bool copyLike = TII.isCopyInstr(*MI) || MI->isSubregToReg(); - if (copyLike) + if (LIS.getInstructionFromIndex(BI.FirstInstr)->isCopyLike()) return false; // Finally, don't isolate an end point that was created by earlier splits. return isOriginalEndpoint(BI.FirstInstr); diff --git a/llvm/lib/CodeGen/TargetInstrInfo.cpp b/llvm/lib/CodeGen/TargetInstrInfo.cpp --- a/llvm/lib/CodeGen/TargetInstrInfo.cpp +++ b/llvm/lib/CodeGen/TargetInstrInfo.cpp @@ -440,9 +440,8 @@ // If the COPY instruction in MI can be folded to a stack operation, return // the register class to use. static const TargetRegisterClass *canFoldCopy(const MachineInstr &MI, - const TargetInstrInfo &TII, unsigned FoldIdx) { - assert(TII.isCopyInstr(MI) && "MI must be a COPY instruction"); + assert(MI.isCopy() && "MI must be a COPY instruction"); if (MI.getNumOperands() != 2) return nullptr; assert(FoldIdx<2 && "FoldIdx refers no nonexistent operand"); @@ -631,10 +630,10 @@ } // Straight COPY may fold as load/store. - if (!isCopyInstr(MI) || Ops.size() != 1) + if (!MI.isCopy() || Ops.size() != 1) return nullptr; - const TargetRegisterClass *RC = canFoldCopy(MI, *this, Ops[0]); + const TargetRegisterClass *RC = canFoldCopy(MI, Ops[0]); if (!RC) return nullptr; diff --git a/llvm/lib/Target/AMDGPU/AMDGPU.h b/llvm/lib/Target/AMDGPU/AMDGPU.h --- a/llvm/lib/Target/AMDGPU/AMDGPU.h +++ b/llvm/lib/Target/AMDGPU/AMDGPU.h @@ -41,7 +41,6 @@ FunctionPass *createSIOptimizeExecMaskingPreRAPass(); FunctionPass *createSIOptimizeVGPRLiveRangePass(); FunctionPass *createSIFixSGPRCopiesPass(); -FunctionPass *createLowerWWMCopiesPass(); FunctionPass *createSIMemoryLegalizerPass(); FunctionPass *createSIInsertWaitcntsPass(); FunctionPass *createSIPreAllocateWWMRegsPass(); @@ -145,9 +144,6 @@ void initializeSIFixVGPRCopiesPass(PassRegistry &); extern char &SIFixVGPRCopiesID; -void initializeSILowerWWMCopiesPass(PassRegistry &); -extern char &SILowerWWMCopiesID; - void initializeSILowerI1CopiesPass(PassRegistry &); extern char &SILowerI1CopiesID; diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -364,7 +364,6 @@ initializeAMDGPUDAGToDAGISelPass(*PR); initializeGCNDPPCombinePass(*PR); initializeSILowerI1CopiesPass(*PR); - initializeSILowerWWMCopiesPass(*PR); initializeSILowerSGPRSpillsPass(*PR); initializeSIFixSGPRCopiesPass(*PR); initializeSIFixVGPRCopiesPass(*PR); @@ -1297,7 +1296,6 @@ } bool GCNPassConfig::addPreRewrite() { - addPass(&SILowerWWMCopiesID); if (EnableRegReassign) addPass(&GCNNSAReassignID); return true; @@ -1352,8 +1350,6 @@ addPass(&SILowerSGPRSpillsID); addPass(createVGPRAllocPass(false)); - - addPass(&SILowerWWMCopiesID); return true; } diff --git a/llvm/lib/Target/AMDGPU/CMakeLists.txt b/llvm/lib/Target/AMDGPU/CMakeLists.txt --- a/llvm/lib/Target/AMDGPU/CMakeLists.txt +++ b/llvm/lib/Target/AMDGPU/CMakeLists.txt @@ -145,7 +145,6 @@ SILoadStoreOptimizer.cpp SILowerControlFlow.cpp SILowerI1Copies.cpp - SILowerWWMCopies.cpp SILowerSGPRSpills.cpp SIMachineFunctionInfo.cpp SIMachineScheduler.cpp diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -99,10 +99,10 @@ SGPR, PrologEpilogSGPRSaveRestoreInfo( SGPRSaveKind::SPILL_TO_VGPR_LANE, FI)); - LLVM_DEBUG(auto Spill = MFI->getSGPRSpillToPhysicalVGPRLanes(FI).front(); - dbgs() << printReg(SGPR, TRI) << " requires fallback spill to " - << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane - << '\n';); + LLVM_DEBUG( + auto Spill = MFI->getPrologEpilogSGPRSpillToVGPRLanes(FI).front(); + dbgs() << printReg(SGPR, TRI) << " requires fallback spill to " + << printReg(Spill.VGPR, TRI) << ':' << Spill.Lane << '\n';); } else { // Remove dead index MF.getFrameInfo().RemoveStackObject(FI); @@ -264,7 +264,7 @@ assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); ArrayRef Spill = - FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI); + FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); assert(Spill.size() == NumSubRegs); for (unsigned I = 0; I < NumSubRegs; ++I) { @@ -309,7 +309,7 @@ void restoreFromVGPRLane(const int FI) { assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); ArrayRef Spill = - FuncInfo->getSGPRSpillToPhysicalVGPRLanes(FI); + FuncInfo->getPrologEpilogSGPRSpillToVGPRLanes(FI); assert(Spill.size() == NumSubRegs); for (unsigned I = 0; I < NumSubRegs; ++I) { @@ -1353,8 +1353,8 @@ if (FuncInfo->allocateVGPRSpillToAGPR(MF, FI, TRI->isAGPR(MRI, VReg))) { assert(RS != nullptr); - RS->enterBasicBlockEnd(MBB); - RS->backward(MI); + // FIXME: change to enterBasicBlockEnd() + RS->enterBasicBlock(MBB); TRI->eliminateFrameIndex(MI, 0, FIOp, RS); SpillFIs.set(FI); continue; diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.h b/llvm/lib/Target/AMDGPU/SIInstrInfo.h --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.h +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.h @@ -170,12 +170,6 @@ Register findUsedSGPR(const MachineInstr &MI, int OpIndices[3]) const; protected: - /// If the specific machine instruction is a instruction that moves/copies - /// value from one register to another register return destination and source - /// registers as machine operands. - std::optional - isCopyInstrImpl(const MachineInstr &MI) const override; - bool swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, MachineOperand &Src1, unsigned Src1OpName) const; @@ -833,7 +827,7 @@ } bool isVGPRCopy(const MachineInstr &MI) const { - assert(isCopyInstr(MI)); + assert(MI.isCopy()); Register Dest = MI.getOperand(0).getReg(); const MachineFunction &MF = *MI.getParent()->getParent(); const MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -903,7 +897,7 @@ if (OpIdx >= MI.getDesc().NumOperands) return false; - if (isCopyInstr(MI)) { + if (MI.isCopy()) { unsigned Size = getOpSize(MI, OpIdx); assert(Size == 8 || Size == 4); @@ -952,12 +946,12 @@ void insertScratchExecCopy(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, Register Reg, bool IsSCCLive, - SlotIndexes *Indexes = nullptr) const; + const DebugLoc &DL, Register Reg, + bool IsSCCLive) const; void restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, - Register Reg, SlotIndexes *Indexes = nullptr) const; + Register Reg) const; /// Return the correct register class for \p OpNo. For target-specific /// instructions, this will return the register class that has been defined @@ -1149,9 +1143,6 @@ CreateTargetMIHazardRecognizer(const InstrItineraryData *II, const ScheduleDAGMI *DAG) const override; - unsigned getLiveRangeSplitOpcode(Register Reg, - const MachineFunction &MF) const override; - bool isBasicBlockPrologue(const MachineInstr &MI) const override; MachineInstr *createPHIDestinationCopy(MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -2414,14 +2414,6 @@ return std::pair(Split[0], Split[1]); } -std::optional -SIInstrInfo::isCopyInstrImpl(const MachineInstr &MI) const { - if (MI.getOpcode() == AMDGPU::WWM_COPY) - return DestSourcePair{MI.getOperand(0), MI.getOperand(1)}; - - return std::nullopt; -} - bool SIInstrInfo::swapSourceModifiers(MachineInstr &MI, MachineOperand &Src0, unsigned Src0OpName, @@ -3088,7 +3080,6 @@ case AMDGPU::S_MOV_B32: case AMDGPU::S_MOV_B64: case AMDGPU::COPY: - case AMDGPU::WWM_COPY: case AMDGPU::V_ACCVGPR_WRITE_B32_e64: case AMDGPU::V_ACCVGPR_READ_B32_e64: case AMDGPU::V_ACCVGPR_MOV_B32: @@ -4978,8 +4969,7 @@ MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, const DebugLoc &DL, Register Reg, - bool IsSCCLive, - SlotIndexes *Indexes) const { + bool IsSCCLive) const { const GCNSubtarget &ST = MF.getSubtarget(); const SIInstrInfo *TII = ST.getInstrInfo(); bool IsWave32 = ST.isWave32(); @@ -4989,34 +4979,23 @@ // the single instruction S_OR_SAVEEXEC that clobbers SCC. unsigned MovOpc = IsWave32 ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; MCRegister Exec = IsWave32 ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - auto StoreExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg) - .addReg(Exec, RegState::Kill); - auto FlipExecMI = BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); - if (Indexes) { - Indexes->insertMachineInstrInMaps(*StoreExecMI); - Indexes->insertMachineInstrInMaps(*FlipExecMI); - } + BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Reg).addReg(Exec, RegState::Kill); + BuildMI(MBB, MBBI, DL, TII->get(MovOpc), Exec).addImm(-1); } else { const unsigned OrSaveExec = IsWave32 ? AMDGPU::S_OR_SAVEEXEC_B32 : AMDGPU::S_OR_SAVEEXEC_B64; auto SaveExec = BuildMI(MBB, MBBI, DL, TII->get(OrSaveExec), Reg).addImm(-1); SaveExec->getOperand(3).setIsDead(); // Mark SCC as dead. - if (Indexes) - Indexes->insertMachineInstrInMaps(*SaveExec); } } void SIInstrInfo::restoreExec(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, - const DebugLoc &DL, Register Reg, - SlotIndexes *Indexes) const { + const DebugLoc &DL, Register Reg) const { unsigned ExecMov = isWave32() ? AMDGPU::S_MOV_B32 : AMDGPU::S_MOV_B64; MCRegister Exec = isWave32() ? AMDGPU::EXEC_LO : AMDGPU::EXEC; - auto ExecRestoreMI = - BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill); - if (Indexes) - Indexes->insertMachineInstrInMaps(*ExecRestoreMI); + BuildMI(MBB, MBBI, DL, get(ExecMov), Exec).addReg(Reg, RegState::Kill); } static const TargetRegisterClass * @@ -8001,16 +7980,6 @@ return ArrayRef(TargetFlags); } -unsigned SIInstrInfo::getLiveRangeSplitOpcode(Register SrcReg, - const MachineFunction &MF) const { - const SIMachineFunctionInfo *MFI = MF.getInfo(); - assert(SrcReg.isVirtual()); - if (MFI->checkFlag(SrcReg, AMDGPU::VirtRegFlag::WWM_REG)) - return AMDGPU::WWM_COPY; - - return AMDGPU::COPY; -} - bool SIInstrInfo::isBasicBlockPrologue(const MachineInstr &MI) const { return !MI.isTerminator() && MI.getOpcode() != AMDGPU::COPY && MI.modifiesRegister(AMDGPU::EXEC, &RI); @@ -8562,7 +8531,7 @@ // A similar issue also exists with spilling and reloading $exec registers. // // To prevent that, constrain the %0 register class here. - if (isFullCopyInstr(MI)) { + if (MI.isFullCopy()) { Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(1).getReg(); if ((DstReg.isVirtual() || SrcReg.isVirtual()) && @@ -8659,7 +8628,7 @@ if (opcode == AMDGPU::V_READLANE_B32 || opcode == AMDGPU::V_READFIRSTLANE_B32) return InstructionUniformity::AlwaysUniform; - if (isCopyInstr(MI)) { + if (MI.isCopy()) { const MachineOperand &srcOp = MI.getOperand(1); if (srcOp.isReg() && srcOp.getReg().isPhysical()) { const TargetRegisterClass *regClass = diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td --- a/llvm/lib/Target/AMDGPU/SIInstructions.td +++ b/llvm/lib/Target/AMDGPU/SIInstructions.td @@ -172,13 +172,6 @@ } // End let hasSideEffects = 0, mayLoad = 0, mayStore = 0, Uses = [EXEC] -def WWM_COPY : SPseudoInstSI < - (outs unknown:$dst), (ins unknown:$src)> { - let hasSideEffects = 0; - let isAsCheapAsAMove = 1; - let isConvergent = 1; -} - def ENTER_STRICT_WWM : SPseudoInstSI <(outs SReg_1:$sdst), (ins i64imm:$src0)> { let Uses = [EXEC]; let Defs = [EXEC, SCC]; diff --git a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp --- a/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp +++ b/llvm/lib/Target/AMDGPU/SILowerSGPRSpills.cpp @@ -50,9 +50,7 @@ SILowerSGPRSpills() : MachineFunctionPass(ID) {} void calculateSaveRestoreBlocks(MachineFunction &MF); - bool spillCalleeSavedRegs(MachineFunction &MF, - SmallVectorImpl &CalleeSavedFIs); - void extendWWMVirtRegLiveness(MachineFunction &MF, LiveIntervals *LIS); + bool spillCalleeSavedRegs(MachineFunction &MF); bool runOnMachineFunction(MachineFunction &MF) override; @@ -60,13 +58,6 @@ AU.setPreservesAll(); MachineFunctionPass::getAnalysisUsage(AU); } - - MachineFunctionProperties getClearedProperties() const override { - // SILowerSGPRSpills introduces new Virtual VGPRs for spilling SGPRs. - return MachineFunctionProperties() - .set(MachineFunctionProperties::Property::IsSSA) - .set(MachineFunctionProperties::Property::NoVRegs); - } }; } // end anonymous namespace @@ -206,8 +197,7 @@ EntryBB.sortUniqueLiveIns(); } -bool SILowerSGPRSpills::spillCalleeSavedRegs( - MachineFunction &MF, SmallVectorImpl &CalleeSavedFIs) { +bool SILowerSGPRSpills::spillCalleeSavedRegs(MachineFunction &MF) { MachineRegisterInfo &MRI = MF.getRegInfo(); const Function &F = MF.getFunction(); const GCNSubtarget &ST = MF.getSubtarget(); @@ -238,7 +228,6 @@ TRI->getSpillAlign(*RC), true); CSI.push_back(CalleeSavedInfo(Reg, JunkFI)); - CalleeSavedFIs.push_back(JunkFI); } } @@ -259,50 +248,6 @@ return false; } -void SILowerSGPRSpills::extendWWMVirtRegLiveness(MachineFunction &MF, - LiveIntervals *LIS) { - // TODO: This is a workaround to avoid the unmodelled liveness computed with - // whole-wave virtual registers when allocated together with the regular VGPR - // virtual registers. Presently, the liveness computed during the regalloc is - // only uniform (or single lane aware) and it doesn't take account of the - // divergent control flow that exists for our GPUs. Since the WWM registers - // can modify inactive lanes, the wave-aware liveness should be computed for - // the virtual registers to accurately plot their interferences. Without - // having the divergent CFG for the function, it is difficult to implement the - // wave-aware liveness info. Until then, we conservatively extend the liveness - // of the wwm registers into the entire function so that they won't be reused - // without first spilling/splitting their liveranges. - SIMachineFunctionInfo *MFI = MF.getInfo(); - - // Insert the IMPLICIT_DEF for the wwm-registers in the entry blocks. - for (auto Reg : MFI->getSGPRSpillVGPRs()) { - for (MachineBasicBlock *SaveBlock : SaveBlocks) { - MachineBasicBlock::iterator InsertBefore = SaveBlock->begin(); - auto MIB = BuildMI(*SaveBlock, *InsertBefore, InsertBefore->getDebugLoc(), - TII->get(AMDGPU::IMPLICIT_DEF), Reg); - MFI->setFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG); - if (LIS) { - LIS->InsertMachineInstrInMaps(*MIB); - } - } - } - - // Insert the KILL in the return blocks to extend their liveness untill the - // end of function. Insert a separate KILL for each VGPR. - for (MachineBasicBlock *RestoreBlock : RestoreBlocks) { - MachineBasicBlock::iterator InsertBefore = - RestoreBlock->getFirstTerminator(); - for (auto Reg : MFI->getSGPRSpillVGPRs()) { - auto MIB = - BuildMI(*RestoreBlock, *InsertBefore, InsertBefore->getDebugLoc(), - TII->get(TargetOpcode::KILL)); - MIB.addReg(Reg); - if (LIS) - LIS->InsertMachineInstrInMaps(*MIB); - } - } -} - bool SILowerSGPRSpills::runOnMachineFunction(MachineFunction &MF) { const GCNSubtarget &ST = MF.getSubtarget(); TII = ST.getInstrInfo(); @@ -316,8 +261,7 @@ // First, expose any CSR SGPR spills. This is mostly the same as what PEI // does, but somewhat simpler. calculateSaveRestoreBlocks(MF); - SmallVector CalleeSavedFIs; - bool HasCSRs = spillCalleeSavedRegs(MF, CalleeSavedFIs); + bool HasCSRs = spillCalleeSavedRegs(MF); MachineFrameInfo &MFI = MF.getFrameInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); @@ -331,7 +275,6 @@ bool MadeChange = false; bool NewReservedRegs = false; - bool SpilledToVirtVGPRLanes = false; // TODO: CSR VGPRs will never be spilled to AGPRs. These can probably be // handled as SpilledToReg in regular PrologEpilogInserter. @@ -354,53 +297,23 @@ int FI = TII->getNamedOperand(MI, AMDGPU::OpName::addr)->getIndex(); assert(MFI.getStackID(FI) == TargetStackID::SGPRSpill); - - bool IsCalleeSaveSGPRSpill = - std::find(CalleeSavedFIs.begin(), CalleeSavedFIs.end(), FI) != - CalleeSavedFIs.end(); - if (IsCalleeSaveSGPRSpill) { - // Spill callee-saved SGPRs into physical VGPR lanes. - - // TODO: This is to ensure the CFIs are static for efficient frame - // unwinding in the debugger. Spilling them into virtual VGPR lanes - // involve regalloc to allocate the physical VGPRs and that might - // cause intermediate spill/split of such liveranges for successful - // allocation. This would result in broken CFI encoding unless the - // regalloc aware CFI generation to insert new CFIs along with the - // intermediate spills is implemented. There is no such support - // currently exist in the LLVM compiler. - if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI, true)) { - NewReservedRegs = true; - bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( - MI, FI, nullptr, Indexes, LIS, true); - if (!Spilled) - llvm_unreachable( - "failed to spill SGPR to physical VGPR lane when allocated"); - } - } else { - if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) { - bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( - MI, FI, nullptr, Indexes, LIS); - if (!Spilled) - llvm_unreachable( - "failed to spill SGPR to virtual VGPR lane when allocated"); - SpillFIs.set(FI); - SpilledToVirtVGPRLanes = true; - } + if (FuncInfo->allocateSGPRSpillToVGPRLane(MF, FI)) { + NewReservedRegs = true; + bool Spilled = TRI->eliminateSGPRToVGPRSpillFrameIndex( + MI, FI, nullptr, Indexes, LIS); + (void)Spilled; + assert(Spilled && "failed to spill SGPR to VGPR when allocated"); + SpillFIs.set(FI); } } } - if (SpilledToVirtVGPRLanes) { - extendWWMVirtRegLiveness(MF, LIS); - if (LIS) { - // Compute the LiveInterval for the newly created virtual registers. - for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) - LIS->createAndComputeVirtRegInterval(Reg); - } - } - + // FIXME: Adding to live-ins redundant with reserving registers. for (MachineBasicBlock &MBB : MF) { + for (auto Reg : FuncInfo->getSGPRSpillVGPRs()) + MBB.addLiveIn(Reg); + MBB.sortUniqueLiveIns(); + // FIXME: The dead frame indices are replaced with a null register from // the debug value instructions. We should instead, update it with the // correct register value. But not sure the register value alone is @@ -421,10 +334,6 @@ // lane". FuncInfo->removeDeadFrameIndices(MFI, /*ResetSGPRSpillStackIDs*/ false); - MadeChange = true; - } - - if (SpilledToVirtVGPRLanes) { const TargetRegisterClass *RC = TRI->getWaveMaskRegClass(); // Shift back the reserved SGPR for EXEC copy into the lowest range. // This SGPR is reserved to handle the whole-wave spill/copy operations @@ -433,21 +342,20 @@ if (UnusedLowSGPR && TRI->getHWRegIndex(UnusedLowSGPR) < TRI->getHWRegIndex(FuncInfo->getSGPRForEXECCopy())) FuncInfo->setSGPRForEXECCopy(UnusedLowSGPR); + + MadeChange = true; } else { - // No SGPR spills to virtual VGPR lanes and hence there won't be any WWM - // spills/copies. Reset the SGPR reserved for EXEC copy. + // No SGPR spills and hence there won't be any WWM spills/copies. Reset the + // SGPR reserved for EXEC copy. FuncInfo->setSGPRForEXECCopy(AMDGPU::NoRegister); } SaveBlocks.clear(); RestoreBlocks.clear(); - // Updated the reserved registers with any physical VGPRs added for SGPR - // spills. - if (NewReservedRegs) { - for (Register Reg : FuncInfo->getWWMReservedRegs()) - MRI.reserveReg(Reg, TRI); - } + // Updated the reserved registers with any VGPRs added for SGPR spills. + if (NewReservedRegs) + MRI.freezeReservedRegs(MF); return MadeChange; } diff --git a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp b/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp deleted file mode 100644 --- a/llvm/lib/Target/AMDGPU/SILowerWWMCopies.cpp +++ /dev/null @@ -1,141 +0,0 @@ -//===-- SILowerWWMCopies.cpp - Lower Copies after regalloc ---===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -/// \file -/// Lowering the WWM_COPY instructions for various register classes. -/// AMDGPU target generates WWM_COPY instruction to differentiate WWM -/// copy from COPY. This pass generates the necessary exec mask manipulation -/// instructions to replicate 'Whole Wave Mode' and lowers WWM_COPY back to -/// COPY. -// -//===----------------------------------------------------------------------===// - -#include "AMDGPU.h" -#include "GCNSubtarget.h" -#include "MCTargetDesc/AMDGPUMCTargetDesc.h" -#include "SIMachineFunctionInfo.h" -#include "llvm/CodeGen/LiveIntervals.h" -#include "llvm/CodeGen/MachineFunctionPass.h" -#include "llvm/CodeGen/VirtRegMap.h" -#include "llvm/InitializePasses.h" - -using namespace llvm; - -#define DEBUG_TYPE "si-lower-wwm-copies" - -namespace { - -class SILowerWWMCopies : public MachineFunctionPass { -public: - static char ID; - - SILowerWWMCopies() : MachineFunctionPass(ID) { - initializeSILowerWWMCopiesPass(*PassRegistry::getPassRegistry()); - } - - bool runOnMachineFunction(MachineFunction &MF) override; - - StringRef getPassName() const override { return "SI Lower WWM Copies"; } - - void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.setPreservesAll(); - MachineFunctionPass::getAnalysisUsage(AU); - } - -private: - bool isSCCLiveAtMI(const MachineInstr &MI); - void addToWWMSpills(MachineFunction &MF, Register Reg); - - LiveIntervals *LIS; - SlotIndexes *Indexes; - VirtRegMap *VRM; - const SIRegisterInfo *TRI; - const MachineRegisterInfo *MRI; - SIMachineFunctionInfo *MFI; -}; - -} // End anonymous namespace. - -INITIALIZE_PASS_BEGIN(SILowerWWMCopies, DEBUG_TYPE, "SI Lower WWM Copies", - false, false) -INITIALIZE_PASS_DEPENDENCY(LiveIntervals) -INITIALIZE_PASS_DEPENDENCY(VirtRegMap) -INITIALIZE_PASS_END(SILowerWWMCopies, DEBUG_TYPE, "SI Lower WWM Copies", false, - false) - -char SILowerWWMCopies::ID = 0; - -char &llvm::SILowerWWMCopiesID = SILowerWWMCopies::ID; - -bool SILowerWWMCopies::isSCCLiveAtMI(const MachineInstr &MI) { - // We can't determine the liveness info if LIS isn't available. Early return - // in that case and always assume SCC is live. - if (!LIS) - return true; - - LiveRange &LR = - LIS->getRegUnit(*MCRegUnitIterator(MCRegister::from(AMDGPU::SCC), TRI)); - SlotIndex Idx = LIS->getInstructionIndex(MI); - return LR.liveAt(Idx); -} - -// If \p Reg is assigned with a physical VGPR, add the latter into wwm-spills -// for preserving its entire lanes at function prolog/epilog. -void SILowerWWMCopies::addToWWMSpills(MachineFunction &MF, Register Reg) { - if (Reg.isPhysical()) - return; - - Register PhysReg = VRM->getPhys(Reg); - assert(PhysReg != VirtRegMap::NO_PHYS_REG && - "should have allocated a physical register"); - - MFI->allocateWWMSpill(MF, PhysReg); -} - -bool SILowerWWMCopies::runOnMachineFunction(MachineFunction &MF) { - const GCNSubtarget &ST = MF.getSubtarget(); - const SIInstrInfo *TII = ST.getInstrInfo(); - - MFI = MF.getInfo(); - LIS = getAnalysisIfAvailable(); - Indexes = getAnalysisIfAvailable(); - VRM = getAnalysisIfAvailable(); - TRI = ST.getRegisterInfo(); - MRI = &MF.getRegInfo(); - - if (!MFI->hasVRegFlags()) - return false; - - bool Changed = false; - for (MachineBasicBlock &MBB : MF) { - for (MachineInstr &MI : MBB) { - if (MI.getOpcode() != AMDGPU::WWM_COPY) - continue; - - // TODO: Club adjacent WWM ops between same exec save/restore - assert(TII->isVGPRCopy(MI)); - - // For WWM vector copies, manipulate the exec mask around the copy - // instruction. - const DebugLoc &DL = MI.getDebugLoc(); - MachineBasicBlock::iterator InsertPt = MI.getIterator(); - Register RegForExecCopy = MFI->getSGPRForEXECCopy(); - TII->insertScratchExecCopy(MF, MBB, InsertPt, DL, RegForExecCopy, - isSCCLiveAtMI(MI), Indexes); - TII->restoreExec(MF, MBB, ++InsertPt, DL, RegForExecCopy, Indexes); - addToWWMSpills(MF, MI.getOperand(0).getReg()); - LLVM_DEBUG(dbgs() << "WWM copy manipulation for " << MI); - - // Lower WWM_COPY back to COPY - MI.setDesc(TII->get(AMDGPU::COPY)); - Changed |= true; - } - } - - return Changed; -} diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.h @@ -496,16 +496,15 @@ }; private: - // To track virtual VGPR + lane index for each subregister of the SGPR spilled - // to frameindex key during SILowerSGPRSpills pass. + // To track VGPR + lane index for each subregister of the SGPR spilled to + // frameindex key during SILowerSGPRSpills pass. + DenseMap> SGPRSpillToVGPRLanes; + // To track VGPR + lane index for spilling special SGPRs like Frame Pointer + // identified during PrologEpilogInserter. DenseMap> - SGPRSpillsToVirtualVGPRLanes; - // To track physical VGPR + lane index for CSR SGPR spills and special SGPRs - // like Frame Pointer identified during PrologEpilogInserter. - DenseMap> - SGPRSpillsToPhysicalVGPRLanes; - unsigned NumVirtualVGPRSpillLanes = 0; - unsigned NumPhysicalVGPRSpillLanes = 0; + PrologEpilogSGPRSpillToVGPRLanes; + unsigned NumVGPRSpillLanes = 0; + unsigned NumVGPRPrologEpilogSpillLanes = 0; SmallVector SpillVGPRs; using WWMSpillsMap = MapVector; // To track the registers used in instructions that can potentially modify the @@ -549,10 +548,10 @@ private: Register VGPRForAGPRCopy; - bool allocateVirtualVGPRForSGPRSpills(MachineFunction &MF, int FI, - unsigned LaneIndex); - bool allocatePhysicalVGPRForSGPRSpills(MachineFunction &MF, int FI, - unsigned LaneIndex); + bool allocateVGPRForSGPRSpills(MachineFunction &MF, int FI, + unsigned LaneIndex); + bool allocateVGPRForPrologEpilogSGPRSpills(MachineFunction &MF, int FI, + unsigned LaneIndex); public: Register getVGPRForAGPRCopy() const { @@ -584,9 +583,9 @@ SIModeRegisterDefaults getMode() const { return Mode; } ArrayRef - getSGPRSpillToVirtualVGPRLanes(int FrameIndex) const { - auto I = SGPRSpillsToVirtualVGPRLanes.find(FrameIndex); - return (I == SGPRSpillsToVirtualVGPRLanes.end()) + getSGPRSpillToVGPRLanes(int FrameIndex) const { + auto I = SGPRSpillToVGPRLanes.find(FrameIndex); + return (I == SGPRSpillToVGPRLanes.end()) ? ArrayRef() : ArrayRef(I->second); } @@ -648,9 +647,9 @@ } ArrayRef - getSGPRSpillToPhysicalVGPRLanes(int FrameIndex) const { - auto I = SGPRSpillsToPhysicalVGPRLanes.find(FrameIndex); - return (I == SGPRSpillsToPhysicalVGPRLanes.end()) + getPrologEpilogSGPRSpillToVGPRLanes(int FrameIndex) const { + auto I = PrologEpilogSGPRSpillToVGPRLanes.find(FrameIndex); + return (I == PrologEpilogSGPRSpillToVGPRLanes.end()) ? ArrayRef() : ArrayRef(I->second); } @@ -668,8 +667,6 @@ return VRegFlags.inBounds(Reg) && VRegFlags[Reg] & Flag; } - bool hasVRegFlags() { return VRegFlags.size(); } - void allocateWWMSpill(MachineFunction &MF, Register VGPR, uint64_t Size = 4, Align Alignment = Align(4)); diff --git a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIMachineFunctionInfo.cpp @@ -314,23 +314,37 @@ return false; } -bool SIMachineFunctionInfo::allocateVirtualVGPRForSGPRSpills( - MachineFunction &MF, int FI, unsigned LaneIndex) { +bool SIMachineFunctionInfo::allocateVGPRForSGPRSpills(MachineFunction &MF, + int FI, + unsigned LaneIndex) { + const GCNSubtarget &ST = MF.getSubtarget(); + const SIRegisterInfo *TRI = ST.getRegisterInfo(); MachineRegisterInfo &MRI = MF.getRegInfo(); Register LaneVGPR; if (!LaneIndex) { - LaneVGPR = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass); + LaneVGPR = TRI->findUnusedRegister(MRI, &AMDGPU::VGPR_32RegClass, MF); + if (LaneVGPR == AMDGPU::NoRegister) { + // We have no VGPRs left for spilling SGPRs. Reset because we will not + // partially spill the SGPR to VGPRs. + SGPRSpillToVGPRLanes.erase(FI); + return false; + } + SpillVGPRs.push_back(LaneVGPR); + // Add this register as live-in to all blocks to avoid machine verifier + // complaining about use of an undefined physical register. + for (MachineBasicBlock &BB : MF) + BB.addLiveIn(LaneVGPR); } else { LaneVGPR = SpillVGPRs.back(); } - SGPRSpillsToVirtualVGPRLanes[FI].push_back( + SGPRSpillToVGPRLanes[FI].push_back( SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex)); return true; } -bool SIMachineFunctionInfo::allocatePhysicalVGPRForSGPRSpills( +bool SIMachineFunctionInfo::allocateVGPRForPrologEpilogSGPRSpills( MachineFunction &MF, int FI, unsigned LaneIndex) { const GCNSubtarget &ST = MF.getSubtarget(); const SIRegisterInfo *TRI = ST.getRegisterInfo(); @@ -341,21 +355,16 @@ if (LaneVGPR == AMDGPU::NoRegister) { // We have no VGPRs left for spilling SGPRs. Reset because we will not // partially spill the SGPR to VGPRs. - SGPRSpillsToPhysicalVGPRLanes.erase(FI); + PrologEpilogSGPRSpillToVGPRLanes.erase(FI); return false; } allocateWWMSpill(MF, LaneVGPR); - reserveWWMRegister(LaneVGPR); - for (MachineBasicBlock &MBB : MF) { - MBB.addLiveIn(LaneVGPR); - MBB.sortUniqueLiveIns(); - } } else { - LaneVGPR = WWMReservedRegs.back(); + LaneVGPR = WWMSpills.back().first; } - SGPRSpillsToPhysicalVGPRLanes[FI].push_back( + PrologEpilogSGPRSpillToVGPRLanes[FI].push_back( SIRegisterInfo::SpilledReg(LaneVGPR, LaneIndex)); return true; } @@ -364,8 +373,8 @@ int FI, bool IsPrologEpilog) { std::vector &SpillLanes = - IsPrologEpilog ? SGPRSpillsToPhysicalVGPRLanes[FI] - : SGPRSpillsToVirtualVGPRLanes[FI]; + IsPrologEpilog ? PrologEpilogSGPRSpillToVGPRLanes[FI] + : SGPRSpillToVGPRLanes[FI]; // This has already been allocated. if (!SpillLanes.empty()) @@ -386,14 +395,15 @@ "not spilling SGPRs to VGPRs"); unsigned &NumSpillLanes = - IsPrologEpilog ? NumPhysicalVGPRSpillLanes : NumVirtualVGPRSpillLanes; + IsPrologEpilog ? NumVGPRPrologEpilogSpillLanes : NumVGPRSpillLanes; for (unsigned I = 0; I < NumLanes; ++I, ++NumSpillLanes) { unsigned LaneIndex = (NumSpillLanes % WaveSize); - bool Allocated = IsPrologEpilog - ? allocatePhysicalVGPRForSGPRSpills(MF, FI, LaneIndex) - : allocateVirtualVGPRForSGPRSpills(MF, FI, LaneIndex); + bool Allocated = + IsPrologEpilog + ? allocateVGPRForPrologEpilogSGPRSpills(MF, FI, LaneIndex) + : allocateVGPRForSGPRSpills(MF, FI, LaneIndex); if (!Allocated) { NumSpillLanes -= I; return false; @@ -474,25 +484,16 @@ bool SIMachineFunctionInfo::removeDeadFrameIndices( MachineFrameInfo &MFI, bool ResetSGPRSpillStackIDs) { - // Remove dead frame indices from function frame, however keep FP & BP since - // spills for them haven't been inserted yet. And also make sure to remove the - // frame indices from `SGPRSpillsToVirtualVGPRLanes` data structure, - // otherwise, it could result in an unexpected side effect and bug, in case of - // any re-mapping of freed frame indices by later pass(es) like "stack slot + // Remove dead frame indices from function frame. And also make sure to remove + // the frame indices from `SGPRSpillToVGPRLanes` data structure, otherwise, it + // could result in an unexpected side effect and bug, in case of any + // re-mapping of freed frame indices by later pass(es) like "stack slot // coloring". - for (auto &R : make_early_inc_range(SGPRSpillsToVirtualVGPRLanes)) { + for (auto &R : make_early_inc_range(SGPRSpillToVGPRLanes)) { MFI.RemoveStackObject(R.first); - SGPRSpillsToVirtualVGPRLanes.erase(R.first); + SGPRSpillToVGPRLanes.erase(R.first); } - // Remove the dead frame indices of CSR SGPRs which are spilled to physical - // VGPR lanes during SILowerSGPRSpills pass. - if (!ResetSGPRSpillStackIDs) { - for (auto &R : make_early_inc_range(SGPRSpillsToPhysicalVGPRLanes)) { - MFI.RemoveStackObject(R.first); - SGPRSpillsToPhysicalVGPRLanes.erase(R.first); - } - } bool HaveSGPRToMemory = false; if (ResetSGPRSpillStackIDs) { diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -142,17 +142,14 @@ void buildVGPRSpillLoadStore(SGPRSpillBuilder &SB, int Index, int Offset, bool IsLoad, bool IsKill = true) const; - /// If \p OnlyToVGPR is true, this will only succeed if this manages to find a - /// free VGPR lane to spill. + /// If \p OnlyToVGPR is true, this will only succeed if this bool spillSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr, - bool OnlyToVGPR = false, - bool SpillToPhysVGPRLane = false) const; + bool OnlyToVGPR = false) const; bool restoreSGPR(MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr, - bool OnlyToVGPR = false, - bool SpillToPhysVGPRLane = false) const; + bool OnlyToVGPR = false) const; bool spillEmergencySGPR(MachineBasicBlock::iterator MI, MachineBasicBlock &RestoreMBB, Register SGPR, @@ -166,10 +163,10 @@ unsigned FIOperandNum, RegScavenger *RS) const override; - bool eliminateSGPRToVGPRSpillFrameIndex( - MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, - SlotIndexes *Indexes = nullptr, LiveIntervals *LIS = nullptr, - bool SpillToPhysVGPRLane = false) const; + bool eliminateSGPRToVGPRSpillFrameIndex(MachineBasicBlock::iterator MI, + int FI, RegScavenger *RS, + SlotIndexes *Indexes = nullptr, + LiveIntervals *LIS = nullptr) const; StringRef getRegAsmName(MCRegister Reg) const override; diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -712,6 +712,9 @@ for (MCPhysReg Reg : MFI->getVGPRSpillAGPRs()) reserveRegisterTuples(Reserved, Reg); + for (auto Reg : MFI->getSGPRSpillVGPRs()) + reserveRegisterTuples(Reserved, Reg); + return Reserved; } @@ -1733,13 +1736,10 @@ bool SIRegisterInfo::spillSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, SlotIndexes *Indexes, - LiveIntervals *LIS, bool OnlyToVGPR, - bool SpillToPhysVGPRLane) const { + LiveIntervals *LIS, bool OnlyToVGPR) const { SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); - ArrayRef VGPRSpills = - SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index) - : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index); + ArrayRef VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index); bool SpillToVGPR = !VGPRSpills.empty(); if (OnlyToVGPR && !SpillToVGPR) return false; @@ -1856,13 +1856,10 @@ bool SIRegisterInfo::restoreSGPR(MachineBasicBlock::iterator MI, int Index, RegScavenger *RS, SlotIndexes *Indexes, - LiveIntervals *LIS, bool OnlyToVGPR, - bool SpillToPhysVGPRLane) const { + LiveIntervals *LIS, bool OnlyToVGPR) const { SGPRSpillBuilder SB(*this, *ST.getInstrInfo(), isWave32, MI, Index, RS); - ArrayRef VGPRSpills = - SpillToPhysVGPRLane ? SB.MFI.getSGPRSpillToPhysicalVGPRLanes(Index) - : SB.MFI.getSGPRSpillToVirtualVGPRLanes(Index); + ArrayRef VGPRSpills = SB.MFI.getSGPRSpillToVGPRLanes(Index); bool SpillToVGPR = !VGPRSpills.empty(); if (OnlyToVGPR && !SpillToVGPR) return false; @@ -2008,7 +2005,7 @@ /// handled. bool SIRegisterInfo::eliminateSGPRToVGPRSpillFrameIndex( MachineBasicBlock::iterator MI, int FI, RegScavenger *RS, - SlotIndexes *Indexes, LiveIntervals *LIS, bool SpillToPhysVGPRLane) const { + SlotIndexes *Indexes, LiveIntervals *LIS) const { switch (MI->getOpcode()) { case AMDGPU::SI_SPILL_S1024_SAVE: case AMDGPU::SI_SPILL_S512_SAVE: @@ -2024,7 +2021,7 @@ case AMDGPU::SI_SPILL_S96_SAVE: case AMDGPU::SI_SPILL_S64_SAVE: case AMDGPU::SI_SPILL_S32_SAVE: - return spillSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane); + return spillSGPR(MI, FI, RS, Indexes, LIS, true); case AMDGPU::SI_SPILL_S1024_RESTORE: case AMDGPU::SI_SPILL_S512_RESTORE: case AMDGPU::SI_SPILL_S384_RESTORE: @@ -2039,7 +2036,7 @@ case AMDGPU::SI_SPILL_S96_RESTORE: case AMDGPU::SI_SPILL_S64_RESTORE: case AMDGPU::SI_SPILL_S32_RESTORE: - return restoreSGPR(MI, FI, RS, Indexes, LIS, true, SpillToPhysVGPRLane); + return restoreSGPR(MI, FI, RS, Indexes, LIS, true); default: llvm_unreachable("not an SGPR spill instruction"); } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/assert-align.ll @@ -11,12 +11,13 @@ ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 2 ; CHECK-NEXT: s_addk_i32 s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_writelane_b32 v41, s16, 0 ; CHECK-NEXT: v_writelane_b32 v40, s31, 1 ; CHECK-NEXT: s_getpc_b64 s[16:17] ; CHECK-NEXT: s_add_u32 s16, s16, ext@rel32@lo+4 @@ -27,9 +28,10 @@ ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 2 +; CHECK-NEXT: v_readlane_b32 s4, v41, 0 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/call-outgoing-stack-args.ll @@ -228,17 +228,18 @@ ; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] ; MUBUF-NEXT: s_addk_i32 s32, 0x400 ; MUBUF-NEXT: v_mov_b32_e32 v0, 9 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; MUBUF-NEXT: v_mov_b32_e32 v0, 10 -; MUBUF-NEXT: v_writelane_b32 v40, s4, 2 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; MUBUF-NEXT: v_mov_b32_e32 v0, 11 ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; MUBUF-NEXT: v_mov_b32_e32 v0, 12 +; MUBUF-NEXT: v_writelane_b32 v41, s4, 0 ; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 ; MUBUF-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:16 ; MUBUF-NEXT: s_getpc_b64 s[4:5] @@ -247,9 +248,10 @@ ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 -; MUBUF-NEXT: v_readlane_b32 s4, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s4, v41, 0 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] ; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 ; MUBUF-NEXT: s_mov_b32 s33, s4 @@ -263,9 +265,10 @@ ; FLATSCR-NEXT: s_mov_b32 s33, s32 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 ; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 -; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2 +; FLATSCR-NEXT: v_writelane_b32 v41, s0, 0 ; FLATSCR-NEXT: s_add_u32 s0, s32, 4 ; FLATSCR-NEXT: v_mov_b32_e32 v0, 9 ; FLATSCR-NEXT: scratch_store_dword off, v0, s0 @@ -286,9 +289,10 @@ ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 -; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s0, v41, 0 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 ; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword v41, off, s33 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: s_add_i32 s32, s32, -16 ; FLATSCR-NEXT: s_mov_b32 s33, s0 @@ -306,12 +310,13 @@ ; MUBUF-NEXT: s_mov_b32 s33, s32 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] ; MUBUF-NEXT: buffer_load_dword v1, v0, s[0:3], 0 offen ; MUBUF-NEXT: buffer_load_dword v2, v0, s[0:3], 0 offen offset:4 ; MUBUF-NEXT: s_addk_i32 s32, 0x400 -; MUBUF-NEXT: v_writelane_b32 v40, s4, 2 ; MUBUF-NEXT: v_writelane_b32 v40, s30, 0 +; MUBUF-NEXT: v_writelane_b32 v41, s4, 0 ; MUBUF-NEXT: v_writelane_b32 v40, s31, 1 ; MUBUF-NEXT: s_getpc_b64 s[4:5] ; MUBUF-NEXT: s_add_u32 s4, s4, external_void_func_byval@rel32@lo+4 @@ -372,9 +377,10 @@ ; MUBUF-NEXT: s_swappc_b64 s[30:31], s[4:5] ; MUBUF-NEXT: v_readlane_b32 s31, v40, 1 ; MUBUF-NEXT: v_readlane_b32 s30, v40, 0 -; MUBUF-NEXT: v_readlane_b32 s4, v40, 2 +; MUBUF-NEXT: v_readlane_b32 s4, v41, 0 ; MUBUF-NEXT: s_or_saveexec_b64 s[6:7], -1 ; MUBUF-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; MUBUF-NEXT: s_mov_b64 exec, s[6:7] ; MUBUF-NEXT: s_addk_i32 s32, 0xfc00 ; MUBUF-NEXT: s_mov_b32 s33, s4 @@ -388,11 +394,12 @@ ; FLATSCR-NEXT: s_mov_b32 s33, s32 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 ; FLATSCR-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: scratch_load_dwordx2 v[1:2], v0, off ; FLATSCR-NEXT: s_add_i32 s32, s32, 16 ; FLATSCR-NEXT: v_add_u32_e32 v3, 8, v0 -; FLATSCR-NEXT: v_writelane_b32 v40, s0, 2 +; FLATSCR-NEXT: v_writelane_b32 v41, s0, 0 ; FLATSCR-NEXT: s_add_u32 s0, s32, 8 ; FLATSCR-NEXT: v_writelane_b32 v40, s30, 0 ; FLATSCR-NEXT: s_add_u32 s2, s32, 56 @@ -437,9 +444,10 @@ ; FLATSCR-NEXT: s_swappc_b64 s[30:31], s[0:1] ; FLATSCR-NEXT: v_readlane_b32 s31, v40, 1 ; FLATSCR-NEXT: v_readlane_b32 s30, v40, 0 -; FLATSCR-NEXT: v_readlane_b32 s0, v40, 2 +; FLATSCR-NEXT: v_readlane_b32 s0, v41, 0 ; FLATSCR-NEXT: s_or_saveexec_b64 s[2:3], -1 ; FLATSCR-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword v41, off, s33 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: s_mov_b64 exec, s[2:3] ; FLATSCR-NEXT: s_add_i32 s32, s32, -16 ; FLATSCR-NEXT: s_mov_b32 s33, s0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/image-waterfall-loop-O0.ll @@ -12,39 +12,32 @@ ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_xor_saveexec_b32 s4, -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 exec_lo, s4 -; CHECK-NEXT: ; implicit-def: $vgpr8 -; CHECK-NEXT: v_mov_b32_e32 v8, v0 -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b32 exec_lo, s21 -; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v15, v1 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v14, v2 -; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v13, v3 -; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v12, v4 -; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v12, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v11, v5 -; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v10, v6 -; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v9, v7 -; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; CHECK-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8 killed $exec -; CHECK-NEXT: v_mov_b32_e32 v2, v15 -; CHECK-NEXT: v_mov_b32_e32 v3, v14 -; CHECK-NEXT: v_mov_b32_e32 v4, v13 -; CHECK-NEXT: v_mov_b32_e32 v5, v12 -; CHECK-NEXT: v_mov_b32_e32 v6, v11 -; CHECK-NEXT: v_mov_b32_e32 v7, v10 -; CHECK-NEXT: v_mov_b32_e32 v8, v9 -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; CHECK-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7 killed $exec +; CHECK-NEXT: v_mov_b32_e32 v1, v15 +; CHECK-NEXT: v_mov_b32_e32 v2, v14 +; CHECK-NEXT: v_mov_b32_e32 v3, v13 +; CHECK-NEXT: v_mov_b32_e32 v4, v12 +; CHECK-NEXT: v_mov_b32_e32 v5, v11 +; CHECK-NEXT: v_mov_b32_e32 v6, v10 +; CHECK-NEXT: v_mov_b32_e32 v7, v9 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill @@ -52,40 +45,34 @@ ; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 s8, 0 ; CHECK-NEXT: s_mov_b32 s4, s8 ; CHECK-NEXT: s_mov_b32 s5, s8 ; CHECK-NEXT: s_mov_b32 s6, s8 ; CHECK-NEXT: s_mov_b32 s7, s8 -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 -; CHECK-NEXT: v_writelane_b32 v0, s6, 2 -; CHECK-NEXT: v_writelane_b32 v0, s7, 3 +; CHECK-NEXT: v_writelane_b32 v8, s4, 0 +; CHECK-NEXT: v_writelane_b32 v8, s5, 1 +; CHECK-NEXT: v_writelane_b32 v8, s6, 2 +; CHECK-NEXT: v_writelane_b32 v8, s7, 3 ; CHECK-NEXT: s_mov_b32 s6, 0 ; CHECK-NEXT: s_mov_b32 s4, s6 ; CHECK-NEXT: s_mov_b32 s5, s6 -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: v_mov_b32_e32 v2, s5 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 +; CHECK-NEXT: v_mov_b32_e32 v1, s5 +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b32 s4, exec_lo -; CHECK-NEXT: v_writelane_b32 v0, s4, 4 -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b32 exec_lo, s21 +; CHECK-NEXT: v_writelane_b32 v8, s4, 4 ; CHECK-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b32 exec_lo, s21 -; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v12, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v14, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v15, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v16, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload @@ -93,16 +80,15 @@ ; CHECK-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; CHECK-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readfirstlane_b32 s12, v8 -; CHECK-NEXT: v_readfirstlane_b32 s10, v7 -; CHECK-NEXT: v_readfirstlane_b32 s9, v6 -; CHECK-NEXT: v_readfirstlane_b32 s8, v5 -; CHECK-NEXT: v_readfirstlane_b32 s7, v4 -; CHECK-NEXT: v_readfirstlane_b32 s6, v3 -; CHECK-NEXT: v_readfirstlane_b32 s5, v2 -; CHECK-NEXT: v_readfirstlane_b32 s4, v1 +; CHECK-NEXT: v_readfirstlane_b32 s12, v7 +; CHECK-NEXT: v_readfirstlane_b32 s10, v6 +; CHECK-NEXT: v_readfirstlane_b32 s9, v5 +; CHECK-NEXT: v_readfirstlane_b32 s8, v4 +; CHECK-NEXT: v_readfirstlane_b32 s7, v3 +; CHECK-NEXT: v_readfirstlane_b32 s6, v2 +; CHECK-NEXT: v_readfirstlane_b32 s5, v1 +; CHECK-NEXT: v_readfirstlane_b32 s4, v0 ; CHECK-NEXT: ; kill: def $sgpr12 killed $sgpr12 def $sgpr12_sgpr13_sgpr14_sgpr15_sgpr16_sgpr17_sgpr18_sgpr19 ; CHECK-NEXT: s_mov_b32 s13, s10 ; CHECK-NEXT: s_mov_b32 s14, s9 @@ -111,84 +97,68 @@ ; CHECK-NEXT: s_mov_b32 s17, s6 ; CHECK-NEXT: s_mov_b32 s18, s5 ; CHECK-NEXT: s_mov_b32 s19, s4 -; CHECK-NEXT: v_writelane_b32 v0, s12, 5 -; CHECK-NEXT: v_writelane_b32 v0, s13, 6 -; CHECK-NEXT: v_writelane_b32 v0, s14, 7 -; CHECK-NEXT: v_writelane_b32 v0, s15, 8 -; CHECK-NEXT: v_writelane_b32 v0, s16, 9 -; CHECK-NEXT: v_writelane_b32 v0, s17, 10 -; CHECK-NEXT: v_writelane_b32 v0, s18, 11 -; CHECK-NEXT: v_writelane_b32 v0, s19, 12 -; CHECK-NEXT: v_mov_b32_e32 v7, v9 -; CHECK-NEXT: v_mov_b32_e32 v8, v10 -; CHECK-NEXT: v_mov_b32_e32 v5, v11 -; CHECK-NEXT: v_mov_b32_e32 v6, v12 -; CHECK-NEXT: v_mov_b32_e32 v3, v13 -; CHECK-NEXT: v_mov_b32_e32 v4, v14 -; CHECK-NEXT: v_mov_b32_e32 v1, v15 -; CHECK-NEXT: v_mov_b32_e32 v2, v16 +; CHECK-NEXT: v_writelane_b32 v8, s12, 5 +; CHECK-NEXT: v_writelane_b32 v8, s13, 6 +; CHECK-NEXT: v_writelane_b32 v8, s14, 7 +; CHECK-NEXT: v_writelane_b32 v8, s15, 8 +; CHECK-NEXT: v_writelane_b32 v8, s16, 9 +; CHECK-NEXT: v_writelane_b32 v8, s17, 10 +; CHECK-NEXT: v_writelane_b32 v8, s18, 11 +; CHECK-NEXT: v_writelane_b32 v8, s19, 12 +; CHECK-NEXT: v_mov_b32_e32 v6, v9 +; CHECK-NEXT: v_mov_b32_e32 v7, v10 +; CHECK-NEXT: v_mov_b32_e32 v4, v11 +; CHECK-NEXT: v_mov_b32_e32 v5, v12 +; CHECK-NEXT: v_mov_b32_e32 v2, v13 +; CHECK-NEXT: v_mov_b32_e32 v3, v14 +; CHECK-NEXT: v_mov_b32_e32 v0, v15 +; CHECK-NEXT: v_mov_b32_e32 v1, v16 ; CHECK-NEXT: s_mov_b64 s[4:5], s[12:13] ; CHECK-NEXT: s_mov_b64 s[10:11], s[14:15] ; CHECK-NEXT: s_mov_b64 s[8:9], s[16:17] ; CHECK-NEXT: s_mov_b64 s[6:7], s[18:19] -; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[4:5], v[7:8] -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[5:6] +; CHECK-NEXT: v_cmp_eq_u64_e64 s4, s[4:5], v[6:7] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[10:11], v[4:5] ; CHECK-NEXT: s_and_b32 s4, s4, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[8:9], v[3:4] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[8:9], v[2:3] ; CHECK-NEXT: s_and_b32 s4, s4, s5 -; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[1:2] +; CHECK-NEXT: v_cmp_eq_u64_e64 s5, s[6:7], v[0:1] ; CHECK-NEXT: s_and_b32 s4, s4, s5 ; CHECK-NEXT: s_and_saveexec_b32 s4, s4 -; CHECK-NEXT: v_writelane_b32 v0, s4, 13 -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b32 exec_lo, s21 +; CHECK-NEXT: v_writelane_b32 v8, s4, 13 ; CHECK-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b32 exec_lo, s21 +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: v_readlane_b32 s4, v8, 13 +; CHECK-NEXT: v_readlane_b32 s8, v8, 5 +; CHECK-NEXT: v_readlane_b32 s9, v8, 6 +; CHECK-NEXT: v_readlane_b32 s10, v8, 7 +; CHECK-NEXT: v_readlane_b32 s11, v8, 8 +; CHECK-NEXT: v_readlane_b32 s12, v8, 9 +; CHECK-NEXT: v_readlane_b32 s13, v8, 10 +; CHECK-NEXT: v_readlane_b32 s14, v8, 11 +; CHECK-NEXT: v_readlane_b32 s15, v8, 12 +; CHECK-NEXT: v_readlane_b32 s16, v8, 0 +; CHECK-NEXT: v_readlane_b32 s17, v8, 1 +; CHECK-NEXT: v_readlane_b32 s18, v8, 2 +; CHECK-NEXT: v_readlane_b32 s19, v8, 3 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readlane_b32 s4, v2, 13 -; CHECK-NEXT: v_readlane_b32 s8, v2, 5 -; CHECK-NEXT: v_readlane_b32 s9, v2, 6 -; CHECK-NEXT: v_readlane_b32 s10, v2, 7 -; CHECK-NEXT: v_readlane_b32 s11, v2, 8 -; CHECK-NEXT: v_readlane_b32 s12, v2, 9 -; CHECK-NEXT: v_readlane_b32 s13, v2, 10 -; CHECK-NEXT: v_readlane_b32 s14, v2, 11 -; CHECK-NEXT: v_readlane_b32 s15, v2, 12 -; CHECK-NEXT: v_readlane_b32 s16, v2, 0 -; CHECK-NEXT: v_readlane_b32 s17, v2, 1 -; CHECK-NEXT: v_readlane_b32 s18, v2, 2 -; CHECK-NEXT: v_readlane_b32 s19, v2, 3 ; CHECK-NEXT: image_sample v0, v[0:1], s[8:15], s[16:19] dmask:0x1 dim:SQ_RSRC_IMG_2D ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; CHECK-NEXT: s_xor_b32 exec_lo, exec_lo, s4 ; CHECK-NEXT: s_cbranch_execnz .LBB0_1 ; CHECK-NEXT: ; %bb.3: -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b32 exec_lo, s21 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: v_readlane_b32 s4, v0, 4 +; CHECK-NEXT: v_readlane_b32 s4, v8, 4 ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: ; %bb.4: -; CHECK-NEXT: s_or_saveexec_b32 s21, -1 -; CHECK-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b32 exec_lo, s21 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; CHECK-NEXT: ; implicit-def: $sgpr4 ; CHECK-NEXT: v_mov_b32_e32 v1, s4 ; CHECK-NEXT: v_mov_b32_e32 v2, s4 ; CHECK-NEXT: v_mov_b32_e32 v3, s4 -; CHECK-NEXT: ; kill: killed $vgpr4 ; CHECK-NEXT: s_xor_saveexec_b32 s4, -1 -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b32 exec_lo, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/localizer.ll @@ -236,21 +236,23 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[18:19] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dword v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s16, 2 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s16, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_swappc_b64 s[30:31], 0 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: v_readlane_b32 s4, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll --- a/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll +++ b/llvm/test/CodeGen/AMDGPU/abi-attribute-hints-undefined-behavior.ll @@ -21,10 +21,11 @@ ; FIXEDABI-NEXT: s_mov_b32 s33, s32 ; FIXEDABI-NEXT: s_or_saveexec_b64 s[18:19], -1 ; FIXEDABI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; FIXEDABI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; FIXEDABI-NEXT: s_mov_b64 exec, s[18:19] -; FIXEDABI-NEXT: v_writelane_b32 v40, s16, 2 ; FIXEDABI-NEXT: s_addk_i32 s32, 0x400 ; FIXEDABI-NEXT: v_writelane_b32 v40, s30, 0 +; FIXEDABI-NEXT: v_writelane_b32 v41, s16, 0 ; FIXEDABI-NEXT: v_writelane_b32 v40, s31, 1 ; FIXEDABI-NEXT: s_getpc_b64 s[16:17] ; FIXEDABI-NEXT: s_add_u32 s16, s16, requires_all_inputs@rel32@lo+4 @@ -32,9 +33,10 @@ ; FIXEDABI-NEXT: s_swappc_b64 s[30:31], s[16:17] ; FIXEDABI-NEXT: v_readlane_b32 s31, v40, 1 ; FIXEDABI-NEXT: v_readlane_b32 s30, v40, 0 -; FIXEDABI-NEXT: v_readlane_b32 s4, v40, 2 +; FIXEDABI-NEXT: v_readlane_b32 s4, v41, 0 ; FIXEDABI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; FIXEDABI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; FIXEDABI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; FIXEDABI-NEXT: s_mov_b64 exec, s[6:7] ; FIXEDABI-NEXT: s_addk_i32 s32, 0xfc00 ; FIXEDABI-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll --- a/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll +++ b/llvm/test/CodeGen/AMDGPU/amdpal-callable.ll @@ -178,19 +178,19 @@ ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} -; GCN-NEXT: .vgpr_count: 0x2b{{$}} +; GCN-NEXT: .vgpr_count: 0x2c{{$}} ; GCN-NEXT: no_stack_extern_call_many_args: ; GCN-NEXT: .lds_size: 0{{$}} ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x90{{$}} -; GCN-NEXT: .vgpr_count: 0x2b{{$}} +; GCN-NEXT: .vgpr_count: 0x2c{{$}} ; GCN-NEXT: no_stack_indirect_call: ; GCN-NEXT: .lds_size: 0{{$}} ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} -; GCN-NEXT: .vgpr_count: 0x2b{{$}} +; GCN-NEXT: .vgpr_count: 0x2c{{$}} ; GCN-NEXT: simple_lds: ; GCN-NEXT: .lds_size: 0x100{{$}} ; GCN-NEXT: .sgpr_count: 0x20{{$}} @@ -200,7 +200,7 @@ ; GCN-NEXT: .lds_size: 0x100{{$}} ; GCN-NEXT: .sgpr_count: 0x28{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x10{{$}} -; GCN-NEXT: .vgpr_count: 0x29{{$}} +; GCN-NEXT: .vgpr_count: 0x2a{{$}} ; GCN-NEXT: simple_stack: ; GCN-NEXT: .lds_size: 0{{$}} ; GCN-NEXT: .sgpr_count: 0x21{{$}} @@ -216,16 +216,16 @@ ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} -; GCN-NEXT: .vgpr_count: 0x2b{{$}} +; GCN-NEXT: .vgpr_count: 0x2c{{$}} ; GCN-NEXT: simple_stack_indirect_call: ; GCN-NEXT: .lds_size: 0{{$}} ; GFX8-NEXT: .sgpr_count: 0x28{{$}} ; GFX9-NEXT: .sgpr_count: 0x2c{{$}} -; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} -; GCN-NEXT: .vgpr_count: 0x2b{{$}} +; GCN-NEXT: .stack_frame_size_in_bytes: 0x30{{$}} +; GCN-NEXT: .vgpr_count: 0x2c{{$}} ; GCN-NEXT: simple_stack_recurse: ; GCN-NEXT: .lds_size: 0{{$}} ; GCN-NEXT: .sgpr_count: 0x28{{$}} ; GCN-NEXT: .stack_frame_size_in_bytes: 0x20{{$}} -; GCN-NEXT: .vgpr_count: 0x2a{{$}} +; GCN-NEXT: .vgpr_count: 0x2b{{$}} ; GCN-NEXT: ... diff --git a/llvm/test/CodeGen/AMDGPU/bf16.ll b/llvm/test/CodeGen/AMDGPU/bf16.ll --- a/llvm/test/CodeGen/AMDGPU/bf16.ll +++ b/llvm/test/CodeGen/AMDGPU/bf16.ll @@ -1371,7 +1371,7 @@ ; GCN-LABEL: test_call: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1395,14 +1395,14 @@ ; GCN-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s10, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1425,14 +1425,14 @@ ; GFX7-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s33, s10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s8, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1455,14 +1455,14 @@ ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: s_mov_b32 s33, s8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s8, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1484,14 +1484,14 @@ ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s7, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1515,7 +1515,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -1528,7 +1528,7 @@ ; GCN-LABEL: test_call_v2bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1556,14 +1556,14 @@ ; GCN-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v2bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s10, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1590,14 +1590,14 @@ ; GFX7-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s33, s10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v2bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s8, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1619,14 +1619,14 @@ ; GFX8-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: s_mov_b32 s33, s8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v2bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s8, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1648,14 +1648,14 @@ ; GFX9-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v2bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s7, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1679,7 +1679,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -1692,7 +1692,7 @@ ; GCN-LABEL: test_call_v3bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1721,14 +1721,14 @@ ; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v3bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s10, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v4, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1756,14 +1756,14 @@ ; GFX7-NEXT: buffer_load_dword v4, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s33, s10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v3bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s8, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1789,14 +1789,14 @@ ; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: s_mov_b32 s33, s8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v3bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s8, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1824,14 +1824,14 @@ ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v3bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s7, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1860,7 +1860,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -1873,7 +1873,7 @@ ; GCN-LABEL: test_call_v4bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1909,14 +1909,14 @@ ; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v4bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s10, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1951,14 +1951,14 @@ ; GFX7-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s33, s10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v4bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s8, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1991,14 +1991,14 @@ ; GFX8-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: s_mov_b32 s33, s8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v4bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s8, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2026,14 +2026,14 @@ ; GFX9-NEXT: buffer_load_dword v3, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v4bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s7, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v3, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2063,7 +2063,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -2076,7 +2076,7 @@ ; GCN-LABEL: test_call_v8bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2128,14 +2128,14 @@ ; GCN-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v8bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s10, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2186,14 +2186,14 @@ ; GFX7-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s33, s10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v8bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s8, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2240,14 +2240,14 @@ ; GFX8-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: s_mov_b32 s33, s8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v8bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s8, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2283,14 +2283,14 @@ ; GFX9-NEXT: buffer_load_dword v5, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v8bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s7, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v5, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2328,7 +2328,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: @@ -2341,7 +2341,7 @@ ; GCN-LABEL: test_call_v16bf16: ; GCN: ; %bb.0: ; %entry ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s8, s33 +; GCN-NEXT: s_mov_b32 s10, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2425,14 +2425,14 @@ ; GCN-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s8 +; GCN-NEXT: s_mov_b32 s33, s10 ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GFX7-LABEL: test_call_v16bf16: ; GFX7: ; %bb.0: ; %entry ; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX7-NEXT: s_mov_b32 s8, s33 +; GFX7-NEXT: s_mov_b32 s10, s33 ; GFX7-NEXT: s_mov_b32 s33, s32 ; GFX7-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX7-NEXT: buffer_store_dword v17, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2515,14 +2515,14 @@ ; GFX7-NEXT: buffer_load_dword v17, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX7-NEXT: s_mov_b64 exec, s[4:5] ; GFX7-NEXT: s_addk_i32 s32, 0xfc00 -; GFX7-NEXT: s_mov_b32 s33, s8 +; GFX7-NEXT: s_mov_b32 s33, s10 ; GFX7-NEXT: s_waitcnt vmcnt(0) ; GFX7-NEXT: s_setpc_b64 s[30:31] ; ; GFX8-LABEL: test_call_v16bf16: ; GFX8: ; %bb.0: ; %entry ; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX8-NEXT: s_mov_b32 s6, s33 +; GFX8-NEXT: s_mov_b32 s8, s33 ; GFX8-NEXT: s_mov_b32 s33, s32 ; GFX8-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX8-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2597,14 +2597,14 @@ ; GFX8-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX8-NEXT: s_mov_b64 exec, s[4:5] ; GFX8-NEXT: s_addk_i32 s32, 0xfc00 -; GFX8-NEXT: s_mov_b32 s33, s6 +; GFX8-NEXT: s_mov_b32 s33, s8 ; GFX8-NEXT: s_waitcnt vmcnt(0) ; GFX8-NEXT: s_setpc_b64 s[30:31] ; ; GFX9-LABEL: test_call_v16bf16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s8, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2656,14 +2656,14 @@ ; GFX9-NEXT: buffer_load_dword v9, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: test_call_v16bf16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s7, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v9, off, s[0:3], s33 ; 4-byte Folded Spill @@ -2717,7 +2717,7 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] entry: diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage-agpr.ll @@ -8,12 +8,12 @@ @alias = hidden alias void (), ptr @aliasee_default ; ALL-LABEL: {{^}}kernel: -; GFX908: .amdhsa_next_free_vgpr 32 -; GFX908-NEXT: .amdhsa_next_free_sgpr 36 +; GFX908: .amdhsa_next_free_vgpr 41 +; GFX908-NEXT: .amdhsa_next_free_sgpr 33 -; GFX90A: .amdhsa_next_free_vgpr 65 -; GFX90A-NEXT: .amdhsa_next_free_sgpr 36 -; GFX90A-NEXT: .amdhsa_accum_offset 32 +; GFX90A: .amdhsa_next_free_vgpr 71 +; GFX90A-NEXT: .amdhsa_next_free_sgpr 33 +; GFX90A-NEXT: .amdhsa_accum_offset 44 define amdgpu_kernel void @kernel() #0 { bb: call void @alias() #2 diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage0.ll @@ -8,7 +8,7 @@ ; CHECK-LABEL: {{^}}kernel0: ; CHECK: .amdhsa_next_free_vgpr 53 -; CHECK-NEXT: .amdhsa_next_free_sgpr 36 +; CHECK-NEXT: .amdhsa_next_free_sgpr 33 define amdgpu_kernel void @kernel0() #0 { bb: call void @alias0() #2 diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage1.ll @@ -9,8 +9,8 @@ ; The parent kernel has a higher VGPR usage than the possible callees. ; CHECK-LABEL: {{^}}kernel1: -; CHECK: .amdhsa_next_free_vgpr 41 -; CHECK-NEXT: .amdhsa_next_free_sgpr 36 +; CHECK: .amdhsa_next_free_vgpr 42 +; CHECK-NEXT: .amdhsa_next_free_sgpr 33 define amdgpu_kernel void @kernel1() #0 { bb: call void asm sideeffect "; clobber v40 ", "~{v40}"() diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage2.ll @@ -8,7 +8,7 @@ ; CHECK-LABEL: {{^}}kernel2: ; CHECK: .amdhsa_next_free_vgpr 53 -; CHECK-NEXT: .amdhsa_next_free_sgpr 36 +; CHECK-NEXT: .amdhsa_next_free_sgpr 33 define amdgpu_kernel void @kernel2() #0 { bb: call void @alias2() #2 diff --git a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll --- a/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll +++ b/llvm/test/CodeGen/AMDGPU/call-alias-register-usage3.ll @@ -8,7 +8,7 @@ ; CHECK-LABEL: {{^}}kernel3: ; CHECK: .amdhsa_next_free_vgpr 253 -; CHECK-NEXT: .amdhsa_next_free_sgpr 36 +; CHECK-NEXT: .amdhsa_next_free_sgpr 33 define amdgpu_kernel void @kernel3() #0 { bb: call void @alias3() #2 diff --git a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/call-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/call-argument-types.ll @@ -5823,6 +5823,7 @@ ; VI-NEXT: s_mov_b32 s33, s32 ; VI-NEXT: s_or_saveexec_b64 s[8:9], -1 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[8:9] ; VI-NEXT: s_addk_i32 s32, 0x400 ; VI-NEXT: v_mov_b32_e32 v0, 11 @@ -5832,7 +5833,6 @@ ; VI-NEXT: v_mov_b32_e32 v0, 13 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; VI-NEXT: v_mov_b32_e32 v0, 14 -; VI-NEXT: v_writelane_b32 v40, s4, 2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; VI-NEXT: v_mov_b32_e32 v0, 15 ; VI-NEXT: v_writelane_b32 v40, s30, 0 @@ -5868,6 +5868,7 @@ ; VI-NEXT: v_mov_b32_e32 v28, 9 ; VI-NEXT: v_mov_b32_e32 v29, 9 ; VI-NEXT: v_mov_b32_e32 v30, 10 +; VI-NEXT: v_writelane_b32 v41, s4, 0 ; VI-NEXT: v_writelane_b32 v40, s31, 1 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 @@ -5875,9 +5876,10 @@ ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: v_readlane_b32 s30, v40, 0 -; VI-NEXT: v_readlane_b32 s4, v40, 2 +; VI-NEXT: v_readlane_b32 s4, v41, 0 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[6:7] ; VI-NEXT: s_addk_i32 s32, 0xfc00 ; VI-NEXT: s_mov_b32 s33, s4 @@ -5891,6 +5893,7 @@ ; CI-NEXT: s_mov_b32 s33, s32 ; CI-NEXT: s_or_saveexec_b64 s[8:9], -1 ; CI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CI-NEXT: s_mov_b64 exec, s[8:9] ; CI-NEXT: s_addk_i32 s32, 0x400 ; CI-NEXT: v_mov_b32_e32 v0, 11 @@ -5900,7 +5903,6 @@ ; CI-NEXT: v_mov_b32_e32 v0, 13 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; CI-NEXT: v_mov_b32_e32 v0, 14 -; CI-NEXT: v_writelane_b32 v40, s4, 2 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; CI-NEXT: v_mov_b32_e32 v0, 15 ; CI-NEXT: v_writelane_b32 v40, s30, 0 @@ -5936,6 +5938,7 @@ ; CI-NEXT: v_mov_b32_e32 v28, 9 ; CI-NEXT: v_mov_b32_e32 v29, 9 ; CI-NEXT: v_mov_b32_e32 v30, 10 +; CI-NEXT: v_writelane_b32 v41, s4, 0 ; CI-NEXT: v_writelane_b32 v40, s31, 1 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 @@ -5943,9 +5946,10 @@ ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: v_readlane_b32 s30, v40, 0 -; CI-NEXT: v_readlane_b32 s4, v40, 2 +; CI-NEXT: v_readlane_b32 s4, v41, 0 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CI-NEXT: s_mov_b64 exec, s[6:7] ; CI-NEXT: s_addk_i32 s32, 0xfc00 ; CI-NEXT: s_mov_b32 s33, s4 @@ -5959,6 +5963,7 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[8:9] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 11 @@ -5968,7 +5973,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 14 -; GFX9-NEXT: v_writelane_b32 v40, s4, 2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -6004,6 +6008,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v28, 9 ; GFX9-NEXT: v_mov_b32_e32 v29, 9 ; GFX9-NEXT: v_mov_b32_e32 v30, 10 +; GFX9-NEXT: v_writelane_b32 v41, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 @@ -6011,9 +6016,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: v_readlane_b32 s4, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 @@ -6026,15 +6032,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 11 :: v_dual_mov_b32 v1, 12 ; GFX11-NEXT: v_dual_mov_b32 v2, 13 :: v_dual_mov_b32 v3, 14 ; GFX11-NEXT: v_mov_b32_e32 v4, 15 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: scratch_store_b32 off, v4, s0 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v0, 0 @@ -6061,9 +6069,11 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -6077,6 +6087,7 @@ ; HSA-NEXT: s_mov_b32 s33, s32 ; HSA-NEXT: s_or_saveexec_b64 s[8:9], -1 ; HSA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; HSA-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; HSA-NEXT: s_mov_b64 exec, s[8:9] ; HSA-NEXT: s_addk_i32 s32, 0x400 ; HSA-NEXT: v_mov_b32_e32 v0, 11 @@ -6086,7 +6097,6 @@ ; HSA-NEXT: v_mov_b32_e32 v0, 13 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; HSA-NEXT: v_mov_b32_e32 v0, 14 -; HSA-NEXT: v_writelane_b32 v40, s4, 2 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; HSA-NEXT: v_mov_b32_e32 v0, 15 ; HSA-NEXT: v_writelane_b32 v40, s30, 0 @@ -6122,6 +6132,7 @@ ; HSA-NEXT: v_mov_b32_e32 v28, 9 ; HSA-NEXT: v_mov_b32_e32 v29, 9 ; HSA-NEXT: v_mov_b32_e32 v30, 10 +; HSA-NEXT: v_writelane_b32 v41, s4, 0 ; HSA-NEXT: v_writelane_b32 v40, s31, 1 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_12xv3i32@rel32@lo+4 @@ -6129,9 +6140,10 @@ ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] ; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: v_readlane_b32 s30, v40, 0 -; HSA-NEXT: v_readlane_b32 s4, v40, 2 +; HSA-NEXT: v_readlane_b32 s4, v41, 0 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 ; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; HSA-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; HSA-NEXT: s_mov_b64 exec, s[6:7] ; HSA-NEXT: s_addk_i32 s32, 0xfc00 ; HSA-NEXT: s_mov_b32 s33, s4 @@ -6162,6 +6174,7 @@ ; VI-NEXT: s_mov_b32 s33, s32 ; VI-NEXT: s_or_saveexec_b64 s[8:9], -1 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[8:9] ; VI-NEXT: s_addk_i32 s32, 0x400 ; VI-NEXT: v_mov_b32_e32 v0, 0x41300000 @@ -6171,7 +6184,6 @@ ; VI-NEXT: v_mov_b32_e32 v0, 0x41500000 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; VI-NEXT: v_mov_b32_e32 v0, 0x41600000 -; VI-NEXT: v_writelane_b32 v40, s4, 2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; VI-NEXT: v_mov_b32_e32 v0, 0x41700000 ; VI-NEXT: v_writelane_b32 v40, s30, 0 @@ -6207,6 +6219,7 @@ ; VI-NEXT: v_mov_b32_e32 v28, 0x41100000 ; VI-NEXT: v_mov_b32_e32 v29, 0x41100000 ; VI-NEXT: v_mov_b32_e32 v30, 0x41200000 +; VI-NEXT: v_writelane_b32 v41, s4, 0 ; VI-NEXT: v_writelane_b32 v40, s31, 1 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 @@ -6214,9 +6227,10 @@ ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: v_readlane_b32 s30, v40, 0 -; VI-NEXT: v_readlane_b32 s4, v40, 2 +; VI-NEXT: v_readlane_b32 s4, v41, 0 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[6:7] ; VI-NEXT: s_addk_i32 s32, 0xfc00 ; VI-NEXT: s_mov_b32 s33, s4 @@ -6230,6 +6244,7 @@ ; CI-NEXT: s_mov_b32 s33, s32 ; CI-NEXT: s_or_saveexec_b64 s[8:9], -1 ; CI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CI-NEXT: s_mov_b64 exec, s[8:9] ; CI-NEXT: s_addk_i32 s32, 0x400 ; CI-NEXT: v_mov_b32_e32 v0, 0x41300000 @@ -6239,7 +6254,6 @@ ; CI-NEXT: v_mov_b32_e32 v0, 0x41500000 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; CI-NEXT: v_mov_b32_e32 v0, 0x41600000 -; CI-NEXT: v_writelane_b32 v40, s4, 2 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; CI-NEXT: v_mov_b32_e32 v0, 0x41700000 ; CI-NEXT: v_writelane_b32 v40, s30, 0 @@ -6275,6 +6289,7 @@ ; CI-NEXT: v_mov_b32_e32 v28, 0x41100000 ; CI-NEXT: v_mov_b32_e32 v29, 0x41100000 ; CI-NEXT: v_mov_b32_e32 v30, 0x41200000 +; CI-NEXT: v_writelane_b32 v41, s4, 0 ; CI-NEXT: v_writelane_b32 v40, s31, 1 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 @@ -6282,9 +6297,10 @@ ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: v_readlane_b32 s30, v40, 0 -; CI-NEXT: v_readlane_b32 s4, v40, 2 +; CI-NEXT: v_readlane_b32 s4, v41, 0 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CI-NEXT: s_mov_b64 exec, s[6:7] ; CI-NEXT: s_addk_i32 s32, 0xfc00 ; CI-NEXT: s_mov_b32 s33, s4 @@ -6298,6 +6314,7 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[8:9] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41300000 @@ -6307,7 +6324,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41500000 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41600000 -; GFX9-NEXT: v_writelane_b32 v40, s4, 2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -6343,6 +6359,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v28, 0x41100000 ; GFX9-NEXT: v_mov_b32_e32 v29, 0x41100000 ; GFX9-NEXT: v_mov_b32_e32 v30, 0x41200000 +; GFX9-NEXT: v_writelane_b32 v41, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 @@ -6350,9 +6367,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: v_readlane_b32 s4, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 @@ -6365,17 +6383,19 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x41300000 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x41400000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41500000 ; GFX11-NEXT: v_mov_b32_e32 v3, 0x41600000 ; GFX11-NEXT: v_dual_mov_b32 v4, 0x41700000 :: v_dual_mov_b32 v5, 1.0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: scratch_store_b32 off, v4, s0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 @@ -6404,9 +6424,11 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -6420,6 +6442,7 @@ ; HSA-NEXT: s_mov_b32 s33, s32 ; HSA-NEXT: s_or_saveexec_b64 s[8:9], -1 ; HSA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; HSA-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; HSA-NEXT: s_mov_b64 exec, s[8:9] ; HSA-NEXT: s_addk_i32 s32, 0x400 ; HSA-NEXT: v_mov_b32_e32 v0, 0x41300000 @@ -6429,7 +6452,6 @@ ; HSA-NEXT: v_mov_b32_e32 v0, 0x41500000 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; HSA-NEXT: v_mov_b32_e32 v0, 0x41600000 -; HSA-NEXT: v_writelane_b32 v40, s4, 2 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 ; HSA-NEXT: v_mov_b32_e32 v0, 0x41700000 ; HSA-NEXT: v_writelane_b32 v40, s30, 0 @@ -6465,6 +6487,7 @@ ; HSA-NEXT: v_mov_b32_e32 v28, 0x41100000 ; HSA-NEXT: v_mov_b32_e32 v29, 0x41100000 ; HSA-NEXT: v_mov_b32_e32 v30, 0x41200000 +; HSA-NEXT: v_writelane_b32 v41, s4, 0 ; HSA-NEXT: v_writelane_b32 v40, s31, 1 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_12xv3f32@rel32@lo+4 @@ -6472,9 +6495,10 @@ ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] ; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: v_readlane_b32 s30, v40, 0 -; HSA-NEXT: v_readlane_b32 s4, v40, 2 +; HSA-NEXT: v_readlane_b32 s4, v41, 0 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 ; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; HSA-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; HSA-NEXT: s_mov_b64 exec, s[6:7] ; HSA-NEXT: s_addk_i32 s32, 0xfc00 ; HSA-NEXT: s_mov_b32 s33, s4 @@ -6505,6 +6529,7 @@ ; VI-NEXT: s_mov_b32 s33, s32 ; VI-NEXT: s_or_saveexec_b64 s[8:9], -1 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[8:9] ; VI-NEXT: s_addk_i32 s32, 0x400 ; VI-NEXT: v_mov_b32_e32 v0, 7 @@ -6522,7 +6547,6 @@ ; VI-NEXT: v_mov_b32_e32 v0, 13 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; VI-NEXT: v_mov_b32_e32 v0, 14 -; VI-NEXT: v_writelane_b32 v40, s4, 2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; VI-NEXT: v_mov_b32_e32 v0, 15 ; VI-NEXT: v_writelane_b32 v40, s30, 0 @@ -6558,6 +6582,7 @@ ; VI-NEXT: v_mov_b32_e32 v28, 5 ; VI-NEXT: v_mov_b32_e32 v29, 5 ; VI-NEXT: v_mov_b32_e32 v30, 6 +; VI-NEXT: v_writelane_b32 v41, s4, 0 ; VI-NEXT: v_writelane_b32 v40, s31, 1 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 @@ -6565,9 +6590,10 @@ ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: v_readlane_b32 s30, v40, 0 -; VI-NEXT: v_readlane_b32 s4, v40, 2 +; VI-NEXT: v_readlane_b32 s4, v41, 0 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[6:7] ; VI-NEXT: s_addk_i32 s32, 0xfc00 ; VI-NEXT: s_mov_b32 s33, s4 @@ -6581,6 +6607,7 @@ ; CI-NEXT: s_mov_b32 s33, s32 ; CI-NEXT: s_or_saveexec_b64 s[8:9], -1 ; CI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CI-NEXT: s_mov_b64 exec, s[8:9] ; CI-NEXT: s_addk_i32 s32, 0x400 ; CI-NEXT: v_mov_b32_e32 v0, 7 @@ -6598,7 +6625,6 @@ ; CI-NEXT: v_mov_b32_e32 v0, 13 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; CI-NEXT: v_mov_b32_e32 v0, 14 -; CI-NEXT: v_writelane_b32 v40, s4, 2 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; CI-NEXT: v_mov_b32_e32 v0, 15 ; CI-NEXT: v_writelane_b32 v40, s30, 0 @@ -6634,6 +6660,7 @@ ; CI-NEXT: v_mov_b32_e32 v28, 5 ; CI-NEXT: v_mov_b32_e32 v29, 5 ; CI-NEXT: v_mov_b32_e32 v30, 6 +; CI-NEXT: v_writelane_b32 v41, s4, 0 ; CI-NEXT: v_writelane_b32 v40, s31, 1 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 @@ -6641,9 +6668,10 @@ ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: v_readlane_b32 s30, v40, 0 -; CI-NEXT: v_readlane_b32 s4, v40, 2 +; CI-NEXT: v_readlane_b32 s4, v41, 0 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CI-NEXT: s_mov_b64 exec, s[6:7] ; CI-NEXT: s_addk_i32 s32, 0xfc00 ; CI-NEXT: s_mov_b32 s33, s4 @@ -6657,6 +6685,7 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[8:9] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 7 @@ -6674,7 +6703,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX9-NEXT: v_mov_b32_e32 v0, 14 -; GFX9-NEXT: v_writelane_b32 v40, s4, 2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -6710,6 +6738,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v28, 5 ; GFX9-NEXT: v_mov_b32_e32 v29, 5 ; GFX9-NEXT: v_mov_b32_e32 v30, 6 +; GFX9-NEXT: v_writelane_b32 v41, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 @@ -6717,9 +6746,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: v_readlane_b32 s4, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 @@ -6732,15 +6762,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 7 :: v_dual_mov_b32 v1, 8 ; GFX11-NEXT: v_dual_mov_b32 v2, 9 :: v_dual_mov_b32 v3, 10 ; GFX11-NEXT: v_dual_mov_b32 v8, 15 :: v_dual_mov_b32 v5, 12 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_dual_mov_b32 v4, 11 :: v_dual_mov_b32 v7, 14 ; GFX11-NEXT: v_mov_b32_e32 v6, 13 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 32 ; GFX11-NEXT: s_add_i32 s1, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 @@ -6772,9 +6804,11 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -6788,6 +6822,7 @@ ; HSA-NEXT: s_mov_b32 s33, s32 ; HSA-NEXT: s_or_saveexec_b64 s[8:9], -1 ; HSA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; HSA-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; HSA-NEXT: s_mov_b64 exec, s[8:9] ; HSA-NEXT: s_addk_i32 s32, 0x400 ; HSA-NEXT: v_mov_b32_e32 v0, 7 @@ -6805,7 +6840,6 @@ ; HSA-NEXT: v_mov_b32_e32 v0, 13 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; HSA-NEXT: v_mov_b32_e32 v0, 14 -; HSA-NEXT: v_writelane_b32 v40, s4, 2 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; HSA-NEXT: v_mov_b32_e32 v0, 15 ; HSA-NEXT: v_writelane_b32 v40, s30, 0 @@ -6841,6 +6875,7 @@ ; HSA-NEXT: v_mov_b32_e32 v28, 5 ; HSA-NEXT: v_mov_b32_e32 v29, 5 ; HSA-NEXT: v_mov_b32_e32 v30, 6 +; HSA-NEXT: v_writelane_b32 v41, s4, 0 ; HSA-NEXT: v_writelane_b32 v40, s31, 1 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_8xv5i32@rel32@lo+4 @@ -6848,9 +6883,10 @@ ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] ; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: v_readlane_b32 s30, v40, 0 -; HSA-NEXT: v_readlane_b32 s4, v40, 2 +; HSA-NEXT: v_readlane_b32 s4, v41, 0 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 ; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; HSA-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; HSA-NEXT: s_mov_b64 exec, s[6:7] ; HSA-NEXT: s_addk_i32 s32, 0xfc00 ; HSA-NEXT: s_mov_b32 s33, s4 @@ -6877,6 +6913,7 @@ ; VI-NEXT: s_mov_b32 s33, s32 ; VI-NEXT: s_or_saveexec_b64 s[8:9], -1 ; VI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; VI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; VI-NEXT: s_mov_b64 exec, s[8:9] ; VI-NEXT: s_addk_i32 s32, 0x400 ; VI-NEXT: v_mov_b32_e32 v0, 0x40e00000 @@ -6894,7 +6931,6 @@ ; VI-NEXT: v_mov_b32_e32 v0, 0x41500000 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; VI-NEXT: v_mov_b32_e32 v0, 0x41600000 -; VI-NEXT: v_writelane_b32 v40, s4, 2 ; VI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; VI-NEXT: v_mov_b32_e32 v0, 0x41700000 ; VI-NEXT: v_writelane_b32 v40, s30, 0 @@ -6930,6 +6966,7 @@ ; VI-NEXT: v_mov_b32_e32 v28, 0x40a00000 ; VI-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; VI-NEXT: v_mov_b32_e32 v30, 0x40c00000 +; VI-NEXT: v_writelane_b32 v41, s4, 0 ; VI-NEXT: v_writelane_b32 v40, s31, 1 ; VI-NEXT: s_getpc_b64 s[4:5] ; VI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 @@ -6937,9 +6974,10 @@ ; VI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; VI-NEXT: v_readlane_b32 s31, v40, 1 ; VI-NEXT: v_readlane_b32 s30, v40, 0 -; VI-NEXT: v_readlane_b32 s4, v40, 2 +; VI-NEXT: v_readlane_b32 s4, v41, 0 ; VI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; VI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; VI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; VI-NEXT: s_mov_b64 exec, s[6:7] ; VI-NEXT: s_addk_i32 s32, 0xfc00 ; VI-NEXT: s_mov_b32 s33, s4 @@ -6953,6 +6991,7 @@ ; CI-NEXT: s_mov_b32 s33, s32 ; CI-NEXT: s_or_saveexec_b64 s[8:9], -1 ; CI-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CI-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CI-NEXT: s_mov_b64 exec, s[8:9] ; CI-NEXT: s_addk_i32 s32, 0x400 ; CI-NEXT: v_mov_b32_e32 v0, 0x40e00000 @@ -6970,7 +7009,6 @@ ; CI-NEXT: v_mov_b32_e32 v0, 0x41500000 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; CI-NEXT: v_mov_b32_e32 v0, 0x41600000 -; CI-NEXT: v_writelane_b32 v40, s4, 2 ; CI-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; CI-NEXT: v_mov_b32_e32 v0, 0x41700000 ; CI-NEXT: v_writelane_b32 v40, s30, 0 @@ -7006,6 +7044,7 @@ ; CI-NEXT: v_mov_b32_e32 v28, 0x40a00000 ; CI-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; CI-NEXT: v_mov_b32_e32 v30, 0x40c00000 +; CI-NEXT: v_writelane_b32 v41, s4, 0 ; CI-NEXT: v_writelane_b32 v40, s31, 1 ; CI-NEXT: s_getpc_b64 s[4:5] ; CI-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 @@ -7013,9 +7052,10 @@ ; CI-NEXT: s_swappc_b64 s[30:31], s[4:5] ; CI-NEXT: v_readlane_b32 s31, v40, 1 ; CI-NEXT: v_readlane_b32 s30, v40, 0 -; CI-NEXT: v_readlane_b32 s4, v40, 2 +; CI-NEXT: v_readlane_b32 s4, v41, 0 ; CI-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CI-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CI-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CI-NEXT: s_mov_b64 exec, s[6:7] ; CI-NEXT: s_addk_i32 s32, 0xfc00 ; CI-NEXT: s_mov_b32 s33, s4 @@ -7029,6 +7069,7 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[8:9] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x40e00000 @@ -7046,7 +7087,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41500000 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41600000 -; GFX9-NEXT: v_writelane_b32 v40, s4, 2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -7082,6 +7122,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v28, 0x40a00000 ; GFX9-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX9-NEXT: v_mov_b32_e32 v30, 0x40c00000 +; GFX9-NEXT: v_writelane_b32 v41, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 @@ -7089,9 +7130,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[4:5] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: v_readlane_b32 s4, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s4 @@ -7104,9 +7146,10 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x40e00000 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x41000000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41100000 @@ -7117,6 +7160,7 @@ ; GFX11-NEXT: v_mov_b32_e32 v5, 0x41400000 ; GFX11-NEXT: v_dual_mov_b32 v6, 0x41500000 :: v_dual_mov_b32 v9, 1.0 ; GFX11-NEXT: v_mov_b32_e32 v7, 0x41600000 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 32 ; GFX11-NEXT: s_add_i32 s1, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 @@ -7147,9 +7191,11 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -7163,6 +7209,7 @@ ; HSA-NEXT: s_mov_b32 s33, s32 ; HSA-NEXT: s_or_saveexec_b64 s[8:9], -1 ; HSA-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; HSA-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; HSA-NEXT: s_mov_b64 exec, s[8:9] ; HSA-NEXT: s_addk_i32 s32, 0x400 ; HSA-NEXT: v_mov_b32_e32 v0, 0x40e00000 @@ -7180,7 +7227,6 @@ ; HSA-NEXT: v_mov_b32_e32 v0, 0x41500000 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; HSA-NEXT: v_mov_b32_e32 v0, 0x41600000 -; HSA-NEXT: v_writelane_b32 v40, s4, 2 ; HSA-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; HSA-NEXT: v_mov_b32_e32 v0, 0x41700000 ; HSA-NEXT: v_writelane_b32 v40, s30, 0 @@ -7216,6 +7262,7 @@ ; HSA-NEXT: v_mov_b32_e32 v28, 0x40a00000 ; HSA-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; HSA-NEXT: v_mov_b32_e32 v30, 0x40c00000 +; HSA-NEXT: v_writelane_b32 v41, s4, 0 ; HSA-NEXT: v_writelane_b32 v40, s31, 1 ; HSA-NEXT: s_getpc_b64 s[4:5] ; HSA-NEXT: s_add_u32 s4, s4, external_void_func_8xv5f32@rel32@lo+4 @@ -7223,9 +7270,10 @@ ; HSA-NEXT: s_swappc_b64 s[30:31], s[4:5] ; HSA-NEXT: v_readlane_b32 s31, v40, 1 ; HSA-NEXT: v_readlane_b32 s30, v40, 0 -; HSA-NEXT: v_readlane_b32 s4, v40, 2 +; HSA-NEXT: v_readlane_b32 s4, v41, 0 ; HSA-NEXT: s_or_saveexec_b64 s[6:7], -1 ; HSA-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; HSA-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; HSA-NEXT: s_mov_b64 exec, s[6:7] ; HSA-NEXT: s_addk_i32 s32, 0xfc00 ; HSA-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll --- a/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll +++ b/llvm/test/CodeGen/AMDGPU/call-graph-register-usage.ll @@ -15,17 +15,17 @@ ; GCN-LABEL: {{^}}indirect_use_vcc: ; GCN: s_mov_b32 s4, s33 -; GCN: v_writelane_b32 v40, s4, 2 +; GCN: v_writelane_b32 v41, s4, 0 ; GCN: v_writelane_b32 v40, s30, 0 ; GCN: v_writelane_b32 v40, s31, 1 ; GCN: s_swappc_b64 ; GCN: v_readlane_b32 s31, v40, 1 ; GCN: v_readlane_b32 s30, v40, 0 -; GCN: v_readlane_b32 s4, v40, 2 +; GCN: v_readlane_b32 s4, v41, 0 ; GCN: s_mov_b32 s33, s4 ; GCN: s_setpc_b64 s[30:31] ; GCN: ; NumSgprs: 36 -; GCN: ; NumVgprs: 41 +; GCN: ; NumVgprs: 42 define void @indirect_use_vcc() #1 { call void @use_vcc() ret void @@ -36,7 +36,7 @@ ; CI: ; NumSgprs: 38 ; VI-NOBUG: ; NumSgprs: 40 ; VI-BUG: ; NumSgprs: 96 -; GCN: ; NumVgprs: 41 +; GCN: ; NumVgprs: 42 define amdgpu_kernel void @indirect_2level_use_vcc_kernel(ptr addrspace(1) %out) #0 { call void @indirect_use_vcc() ret void @@ -54,7 +54,7 @@ ; GCN-LABEL: {{^}}indirect_use_flat_scratch: ; CI: ; NumSgprs: 38 ; VI: ; NumSgprs: 40 -; GCN: ; NumVgprs: 41 +; GCN: ; NumVgprs: 42 define void @indirect_use_flat_scratch() #1 { call void @use_flat_scratch() ret void @@ -65,7 +65,7 @@ ; CI: ; NumSgprs: 38 ; VI-NOBUG: ; NumSgprs: 40 ; VI-BUG: ; NumSgprs: 96 -; GCN: ; NumVgprs: 41 +; GCN: ; NumVgprs: 42 define amdgpu_kernel void @indirect_2level_use_flat_scratch_kernel(ptr addrspace(1) %out) #0 { call void @indirect_use_flat_scratch() ret void @@ -80,7 +80,7 @@ } ; GCN-LABEL: {{^}}indirect_use_10_vgpr: -; GCN: ; NumVgprs: 41 +; GCN: ; NumVgprs: 42 define void @indirect_use_10_vgpr() #0 { call void @use_10_vgpr() ret void @@ -88,7 +88,7 @@ ; GCN-LABEL: {{^}}indirect_2_level_use_10_vgpr: ; GCN: is_dynamic_callstack = 0 -; GCN: ; NumVgprs: 41 +; GCN: ; NumVgprs: 42 define amdgpu_kernel void @indirect_2_level_use_10_vgpr() #0 { call void @indirect_use_10_vgpr() ret void diff --git a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/call-preserved-registers.ll @@ -24,11 +24,13 @@ ; GCN-LABEL: {{^}}test_func_call_external_void_func_void_clobber_s30_s31_call_external_void_func_void: ; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 ; MUBUF: buffer_store_dword +; MUBUF: buffer_store_dword +; FLATSCR: scratch_store_dword ; FLATSCR: scratch_store_dword -; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4 ; GCN: v_writelane_b32 v40, s30, 0 ; GCN: v_writelane_b32 v40, s31, 1 ; GCN: v_writelane_b32 v40, s34, 2 +; GCN: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0 ; GCN: v_writelane_b32 v40, s35, 3 ; GCN: s_swappc_b64 @@ -42,8 +44,10 @@ ; FLATSCR-DAG: v_readlane_b32 s31, v40, 1 ; FLATSCR-DAG: v_readlane_b32 s30, v40, 0 -; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4 +; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0 ; MUBUF: buffer_load_dword +; MUBUF: buffer_load_dword +; FLATSCR: scratch_load_dword ; FLATSCR: scratch_load_dword ; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN: s_setpc_b64 s[30:31] @@ -58,17 +62,21 @@ ; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 ; GCN: s_mov_b32 s33, s32 ; MUBUF: buffer_store_dword v40 +; MUBUF: buffer_store_dword v41 ; FLATSCR: scratch_store_dword off, v40 -; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4 +; FLATSCR: scratch_store_dword off, v41 ; MUBUF: s_addk_i32 s32, 0x400 ; FLATSCR: s_add_i32 s32, s32, 16 +; GCN: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0 ; GCN: s_swappc_b64 ; GCN-NEXT: s_swappc_b64 -; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4 +; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0 ; MUBUF: buffer_load_dword v40 +; MUBUF: buffer_load_dword v41 ; FLATSCR: scratch_load_dword v40 +; FLATSCR: scratch_load_dword v41 ; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]] define void @test_func_call_external_void_funcx2() #0 { call void @external_void_func_void() diff --git a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll --- a/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll +++ b/llvm/test/CodeGen/AMDGPU/callee-frame-setup.ll @@ -89,13 +89,15 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR_1:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR_1:v[0-9]+]], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN: v_writelane_b32 [[CSR_VGPR]], [[FP_SCRATCH_COPY]], 2 ; MUBUF-DAG: s_addk_i32 s32, 0x400{{$}} ; FLATSCR-DAG: s_add_i32 s32, s32, 16{{$}} ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, ; GCN-DAG: v_mov_b32_e32 [[ZERO:v[0-9]+]], 0{{$}} +; GCN: v_writelane_b32 [[CSR_VGPR_1]], [[FP_SCRATCH_COPY]], 0 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, ; MUBUF-DAG: buffer_store_dword [[ZERO]], off, s[0:3], s33{{$}} @@ -106,10 +108,12 @@ ; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]] ; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]] -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR]], 2 +; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR_1]], 0 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR_1]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 offset:4 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR_1]], off, s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; MUBUF: s_addk_i32 s32, 0xfc00{{$}} ; FLATSCR: s_add_i32 s32, s32, -16{{$}} @@ -136,11 +140,13 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR:v[0-9]+]], off, s[0:3], s33 ; 4-byte Folded Spill +; MUBUF-NEXT: buffer_store_dword [[CSR_VGPR_1:v[0-9]+]], off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR:v[0-9]+]], s33 ; 4-byte Folded Spill +; FLATSCR-NEXT: scratch_store_dword off, [[CSR_VGPR_1:v[0-9]+]], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] ; MUBUF-DAG: s_addk_i32 s32, 0x400 ; FLATSCR-DAG: s_add_i32 s32, s32, 16 -; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], [[FP_SCRATCH_COPY]], [[FP_SPILL_LANE:[0-9]+]] +; GCN-DAG: v_writelane_b32 [[CSR_VGPR_1]], [[FP_SCRATCH_COPY]], [[FP_SPILL_LANE:[0-9]+]] ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s30, 0 ; GCN-DAG: v_writelane_b32 [[CSR_VGPR]], s31, 1 @@ -149,10 +155,12 @@ ; GCN-DAG: v_readlane_b32 s30, [[CSR_VGPR]], 0 ; GCN-DAG: v_readlane_b32 s31, [[CSR_VGPR]], 1 -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR]], [[FP_SPILL_LANE]] +; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSR_VGPR_1]], [[FP_SPILL_LANE]] ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR]], off, s[0:3], s33 ; 4-byte Folded Reload +; MUBUF-NEXT: buffer_load_dword [[CSR_VGPR_1]], off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR]], off, s33 ; 4-byte Folded Reload +; FLATSCR-NEXT: scratch_load_dword [[CSR_VGPR_1]], off, s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; MUBUF: s_addk_i32 s32, 0xfc00 ; FLATSCR: s_add_i32 s32, s32, -16 @@ -623,14 +631,14 @@ ; Make sure that the FP save happens after restoring exec from the same ; register. ; GCN-LABEL: {{^}}callee_need_to_spill_fp_to_reg: -; FLATSCR: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 +; FLATSCR: s_mov_b32 s2, s33 ; FLATSCR: s_mov_b32 s33, s32 ; GCN-NOT: v_writelane_b32 v40, s33 -; FLATSCR: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} -; FLATSCR: s_mov_b64 exec, [[COPY_EXEC0]] -; FLATSCR: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} +; FLATSCR: s_or_saveexec_b64 s[4:5], -1 +; FLATSCR: s_mov_b64 exec, s[4:5] +; FLATSCR: s_or_saveexec_b64 s[4:5], -1 ; GCN-NOT: v_readlane_b32 s33, v40 -; FLATSCR: s_mov_b32 s33, [[FP_SCRATCH_COPY]] +; FLATSCR: s_mov_b32 s33, s2 ; GCN: s_setpc_b64 define void @callee_need_to_spill_fp_to_reg() #1 { call void asm sideeffect "; clobber nonpreserved SGPRs and 64 CSRs", diff --git a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll --- a/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll +++ b/llvm/test/CodeGen/AMDGPU/cf-loop-on-constant.ll @@ -30,13 +30,6 @@ ; ; GCN_DBG-LABEL: test_loop: ; GCN_DBG: ; %bb.0: ; %entry -; GCN_DBG-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN_DBG-NEXT: s_mov_b32 s14, -1 -; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN_DBG-NEXT: s_add_u32 s12, s12, s11 -; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 ; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 @@ -46,25 +39,11 @@ ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: s_cmp_lg_u32 s1, s2 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 -; GCN_DBG-NEXT: s_mov_b64 s[6:7], exec -; GCN_DBG-NEXT: s_mov_b64 exec, -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN_DBG-NEXT: ; %bb.1: ; %for.exit -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] -; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB0_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] -; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 @@ -86,16 +65,8 @@ ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB0_2 ; GCN_DBG-NEXT: ; %bb.3: ; %DummyReturnBlock -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] -; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm entry: %cmp = icmp eq i32 %n, -1 @@ -138,36 +109,16 @@ ; ; GCN_DBG-LABEL: loop_const_true: ; GCN_DBG: ; %bb.0: ; %entry -; GCN_DBG-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN_DBG-NEXT: s_mov_b32 s14, -1 -; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN_DBG-NEXT: s_add_u32 s12, s12, s11 -; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 ; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_branch .LBB1_2 ; GCN_DBG-NEXT: .LBB1_1: ; %for.exit -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] -; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB1_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] -; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 @@ -189,9 +140,6 @@ ; GCN_DBG-NEXT: s_mov_b64 s[2:3], 0 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB1_1 ; GCN_DBG-NEXT: s_branch .LBB1_2 entry: @@ -226,36 +174,16 @@ ; ; GCN_DBG-LABEL: loop_const_false: ; GCN_DBG: ; %bb.0: ; %entry -; GCN_DBG-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN_DBG-NEXT: s_mov_b32 s14, -1 -; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN_DBG-NEXT: s_add_u32 s12, s12, s11 -; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 ; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_branch .LBB2_2 ; GCN_DBG-NEXT: .LBB2_1: ; %for.exit -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] -; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB2_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] -; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 @@ -277,9 +205,6 @@ ; GCN_DBG-NEXT: s_mov_b64 s[2:3], -1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB2_1 ; GCN_DBG-NEXT: s_branch .LBB2_2 entry: @@ -315,36 +240,16 @@ ; ; GCN_DBG-LABEL: loop_const_undef: ; GCN_DBG: ; %bb.0: ; %entry -; GCN_DBG-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN_DBG-NEXT: s_mov_b32 s14, -1 -; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN_DBG-NEXT: s_add_u32 s12, s12, s11 -; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 ; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_branch .LBB3_2 ; GCN_DBG-NEXT: .LBB3_1: ; %for.exit -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] -; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB3_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] -; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 1 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 0 ; GCN_DBG-NEXT: s_mov_b32 s1, 2 @@ -364,9 +269,6 @@ ; GCN_DBG-NEXT: s_mov_b32 s1, 1 ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_cbranch_scc1 .LBB3_1 ; GCN_DBG-NEXT: s_branch .LBB3_2 entry: @@ -416,13 +318,6 @@ ; ; GCN_DBG-LABEL: loop_arg_0: ; GCN_DBG: ; %bb.0: ; %entry -; GCN_DBG-NEXT: s_mov_b32 s12, SCRATCH_RSRC_DWORD0 -; GCN_DBG-NEXT: s_mov_b32 s13, SCRATCH_RSRC_DWORD1 -; GCN_DBG-NEXT: s_mov_b32 s14, -1 -; GCN_DBG-NEXT: s_mov_b32 s15, 0xe8f000 -; GCN_DBG-NEXT: s_add_u32 s12, s12, s11 -; GCN_DBG-NEXT: s_addc_u32 s13, s13, 0 -; GCN_DBG-NEXT: ; implicit-def: $vgpr0 ; GCN_DBG-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN_DBG-NEXT: s_waitcnt lgkmcnt(0) ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 0 @@ -440,24 +335,11 @@ ; GCN_DBG-NEXT: v_writelane_b32 v0, s1, 2 ; GCN_DBG-NEXT: s_mov_b32 s0, 0 ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_branch .LBB4_2 ; GCN_DBG-NEXT: .LBB4_1: ; %for.exit -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] -; GCN_DBG-NEXT: ; kill: killed $vgpr0 ; GCN_DBG-NEXT: s_endpgm ; GCN_DBG-NEXT: .LBB4_2: ; %for.body ; GCN_DBG-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: s_waitcnt expcnt(0) -; GCN_DBG-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] -; GCN_DBG-NEXT: s_waitcnt vmcnt(0) ; GCN_DBG-NEXT: v_readlane_b32 s0, v0, 3 ; GCN_DBG-NEXT: v_readlane_b32 s2, v0, 1 ; GCN_DBG-NEXT: v_readlane_b32 s3, v0, 2 @@ -480,9 +362,6 @@ ; GCN_DBG-NEXT: s_add_i32 s0, s0, s1 ; GCN_DBG-NEXT: s_and_b64 vcc, exec, s[2:3] ; GCN_DBG-NEXT: v_writelane_b32 v0, s0, 3 -; GCN_DBG-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN_DBG-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN_DBG-NEXT: s_mov_b64 exec, s[6:7] ; GCN_DBG-NEXT: s_cbranch_vccnz .LBB4_1 ; GCN_DBG-NEXT: s_branch .LBB4_2 entry: diff --git a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll --- a/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll +++ b/llvm/test/CodeGen/AMDGPU/collapse-endcf.ll @@ -48,37 +48,24 @@ ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: ; implicit-def: $vgpr1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 0 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 1 -; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB0_4 ; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1 +; GCN-O0-NEXT: v_readlane_b32 s4, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v1, 1 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s0, 0 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 @@ -86,69 +73,54 @@ ; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[0:1] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v0 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 ; GCN-O0-NEXT: s_mov_b32 s0, 2 ; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s0 ; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 -; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v1, s0 +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5 -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 4 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 5 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB0_3 ; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 -; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 -; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1 -; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 ; GCN-O0-NEXT: s_mov_b32 s2, 2 -; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2 +; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s2 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GCN-O0-NEXT: s_mov_b32 s5, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-O0-NEXT: .LBB0_3: ; %Flow -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 5 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: .LBB0_4: ; %bb.outer.end -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 -; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 -; GCN-O0-NEXT: ds_write_b32 v1, v2 -; GCN-O0-NEXT: ; kill: killed $vgpr0 +; GCN-O0-NEXT: ds_write_b32 v0, v2 ; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -221,37 +193,24 @@ ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: ; implicit-def: $vgpr1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 0 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 1 -; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB1_3 ; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s5, v0, 1 +; GCN-O0-NEXT: v_readlane_b32 s4, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s5, v1, 1 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s0, 0 ; GCN-O0-NEXT: ; kill: def $sgpr0 killed $sgpr0 def $sgpr0_sgpr1 @@ -259,92 +218,76 @@ ; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[0:1] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v1 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v0 +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 ; GCN-O0-NEXT: s_mov_b32 s0, 2 ; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s0 ; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 -; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v1, s0 +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[2:3], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5 -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 4 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 5 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB1_4 ; GCN-O0-NEXT: ; %bb.2: ; %bb.inner.then -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 -; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 -; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1 -; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 ; GCN-O0-NEXT: s_mov_b32 s2, 2 -; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2 +; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s2 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GCN-O0-NEXT: s_mov_b32 s5, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-O0-NEXT: s_branch .LBB1_4 ; GCN-O0-NEXT: .LBB1_3: ; %Flow -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_branch .LBB1_5 ; GCN-O0-NEXT: .LBB1_4: ; %bb.inner.end -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s2, v0, 4 -; GCN-O0-NEXT: v_readlane_b32 s3, v0, 5 +; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s2, v1, 4 +; GCN-O0-NEXT: v_readlane_b32 s3, v1, 5 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[2:3] -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 -; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 -; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1 -; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 -; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], v0 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], v0 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GCN-O0-NEXT: s_mov_b32 s5, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-O0-NEXT: s_branch .LBB1_3 ; GCN-O0-NEXT: .LBB1_5: ; %bb.outer.end -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 -; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 -; GCN-O0-NEXT: ds_write_b32 v1, v2 -; GCN-O0-NEXT: ; kill: killed $vgpr0 +; GCN-O0-NEXT: ds_write_b32 v0, v2 ; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -430,19 +373,13 @@ ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: ; implicit-def: $vgpr1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 ; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[0:1] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0 -; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_writelane_b32 v1, s2, 0 +; GCN-O0-NEXT: v_writelane_b32 v1, s3, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 @@ -453,130 +390,96 @@ ; GCN-O0-NEXT: ; implicit-def: $sgpr4 ; GCN-O0-NEXT: v_mov_b32_e32 v4, 0 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 ; GCN-O0-NEXT: s_mov_b32 s4, 2 ; GCN-O0-NEXT: v_lshl_b64 v[3:4], v[2:3], s4 ; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64 ; GCN-O0-NEXT: s_mov_b32 s0, 1 -; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB2_6 ; GCN-O0-NEXT: ; %bb.1: ; %bb.outer.then -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 2 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[0:1], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[2:3], exec ; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] ; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3] -; GCN-O0-NEXT: v_writelane_b32 v0, s2, 4 -; GCN-O0-NEXT: v_writelane_b32 v0, s3, 5 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: v_writelane_b32 v1, s2, 4 +; GCN-O0-NEXT: v_writelane_b32 v1, s3, 5 ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB2_2 ; GCN-O0-NEXT: s_branch .LBB2_4 ; GCN-O0-NEXT: .LBB2_2: ; %Flow -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 5 ; GCN-O0-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GCN-O0-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 6 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 7 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 6 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 7 ; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB2_5 ; GCN-O0-NEXT: ; %bb.3: ; %bb.then -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 1 -; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 -; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1 -; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 ; GCN-O0-NEXT: s_mov_b32 s2, 2 -; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], s2 +; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s2 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GCN-O0-NEXT: s_mov_b32 s5, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-O0-NEXT: s_branch .LBB2_5 ; GCN-O0-NEXT: .LBB2_4: ; %bb.else -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 -; GCN-O0-NEXT: v_add_i32_e64 v1, s[2:3], v1, v0 -; GCN-O0-NEXT: v_ashrrev_i32_e64 v3, 31, v1 -; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v2, v3 -; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[1:2], v0 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_add_i32_e64 v2, s[2:3], v2, v0 +; GCN-O0-NEXT: v_ashrrev_i32_e64 v4, 31, v2 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 +; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], v0 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: ; kill: def $sgpr4 killed $sgpr4 def $sgpr4_sgpr5 ; GCN-O0-NEXT: s_mov_b32 s5, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-O0-NEXT: s_branch .LBB2_2 ; GCN-O0-NEXT: .LBB2_5: ; %Flow1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 7 +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 6 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 7 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: .LBB2_6: ; %bb.outer.end -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 -; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 -; GCN-O0-NEXT: ds_write_b32 v1, v2 -; GCN-O0-NEXT: ; kill: killed $vgpr0 +; GCN-O0-NEXT: ds_write_b32 v0, v2 ; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -676,19 +579,14 @@ ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: ; implicit-def: $vgpr1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] ; GCN-O0-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x9 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:16 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 0 ; GCN-O0-NEXT: ; implicit-def: $sgpr0 ; GCN-O0-NEXT: v_mov_b32_e32 v4, 0 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 ; GCN-O0-NEXT: v_mov_b32_e32 v3, v4 ; GCN-O0-NEXT: s_mov_b32 s0, 2 ; GCN-O0-NEXT: s_mov_b32 s1, s0 @@ -703,9 +601,9 @@ ; GCN-O0-NEXT: v_addc_u32_e64 v2, s[2:3], v2, v6, s[2:3] ; GCN-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec ; GCN-O0-NEXT: v_mov_b32_e32 v6, v2 -; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v6, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s1, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 ; GCN-O0-NEXT: ; kill: def $sgpr2 killed $sgpr2 def $sgpr2_sgpr3 @@ -714,43 +612,29 @@ ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] ; GCN-O0-NEXT: v_mov_b32_e32 v2, 0 ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 -; GCN-O0-NEXT: v_cmp_lt_u32_e64 s[0:1], v1, s0 +; GCN-O0-NEXT: v_cmp_lt_u32_e64 s[0:1], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[2:3], exec ; GCN-O0-NEXT: s_and_b64 s[0:1], s[2:3], s[0:1] ; GCN-O0-NEXT: s_xor_b64 s[2:3], s[0:1], s[2:3] -; GCN-O0-NEXT: v_writelane_b32 v0, s2, 0 -; GCN-O0-NEXT: v_writelane_b32 v0, s3, 1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: v_writelane_b32 v1, s2, 0 +; GCN-O0-NEXT: v_writelane_b32 v1, s3, 1 ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_1 ; GCN-O0-NEXT: s_branch .LBB3_4 ; GCN-O0-NEXT: .LBB3_1: ; %Flow2 -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 ; GCN-O0-NEXT: s_or_saveexec_b64 s[0:1], s[0:1] ; GCN-O0-NEXT: s_and_b64 s[0:1], exec, s[0:1] -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3 ; GCN-O0-NEXT: s_xor_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_8 ; GCN-O0-NEXT: ; %bb.2: ; %bb.outer.then -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 ; GCN-O0-NEXT: s_mov_b32 s4, s2 @@ -763,20 +647,17 @@ ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[0:3], 0 addr64 offset:4 ; GCN-O0-NEXT: s_mov_b32 s0, 2 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 4 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 5 -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 4 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 5 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_7 ; GCN-O0-NEXT: ; %bb.3: ; %bb.inner.then -; GCN-O0-NEXT: s_waitcnt expcnt(1) -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 ; GCN-O0-NEXT: s_mov_b32 s4, s2 @@ -785,19 +666,14 @@ ; GCN-O0-NEXT: s_mov_b32 s1, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v0, 2 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:8 +; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 offset:8 ; GCN-O0-NEXT: s_branch .LBB3_7 ; GCN-O0-NEXT: .LBB3_4: ; %bb.outer.else -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s1, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s0, 0 ; GCN-O0-NEXT: s_mov_b32 s2, s0 @@ -806,23 +682,21 @@ ; GCN-O0-NEXT: s_mov_b32 s5, s0 ; GCN-O0-NEXT: ; kill: def $sgpr4_sgpr5 killed $sgpr4_sgpr5 def $sgpr4_sgpr5_sgpr6_sgpr7 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[2:3] +; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) ; GCN-O0-NEXT: buffer_store_dword v2, v[3:4], s[4:7], 0 addr64 offset:12 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v1, s0 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[2:3], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 6 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 7 -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 6 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 7 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB3_6 ; GCN-O0-NEXT: ; %bb.5: ; %bb.inner.then2 -; GCN-O0-NEXT: s_waitcnt expcnt(1) -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b32 s0, 0xf000 ; GCN-O0-NEXT: s_mov_b32 s2, 0 ; GCN-O0-NEXT: s_mov_b32 s4, s2 @@ -831,43 +705,27 @@ ; GCN-O0-NEXT: s_mov_b32 s1, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v0, 4 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 offset:16 +; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 offset:16 ; GCN-O0-NEXT: .LBB3_6: ; %Flow -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 6 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 7 +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 6 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 7 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: s_branch .LBB3_1 ; GCN-O0-NEXT: .LBB3_7: ; %Flow1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 4 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 5 +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 4 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 5 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] ; GCN-O0-NEXT: .LBB3_8: ; %bb.outer.end -; GCN-O0-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[8:9] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: v_mov_b32_e32 v2, 3 -; GCN-O0-NEXT: v_mov_b32_e32 v1, 0 +; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 ; GCN-O0-NEXT: s_mov_b32 m0, -1 -; GCN-O0-NEXT: ds_write_b32 v1, v2 -; GCN-O0-NEXT: ; kill: killed $vgpr0 +; GCN-O0-NEXT: ds_write_b32 v0, v2 ; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -931,35 +789,23 @@ ; GCN-O0-NEXT: s_mov_b32 s15, 0xe8f000 ; GCN-O0-NEXT: s_add_u32 s12, s12, s11 ; GCN-O0-NEXT: s_addc_u32 s13, s13, 0 -; GCN-O0-NEXT: ; implicit-def: $vgpr1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] ; GCN-O0-NEXT: s_load_dwordx2 s[0:1], s[4:5], 0x9 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 0 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 1 -; GCN-O0-NEXT: v_mov_b32_e32 v2, v1 -; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 0 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 1 +; GCN-O0-NEXT: v_mov_b32_e32 v2, v0 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b32 s0, 1 -; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v1, s0 +; GCN-O0-NEXT: v_cmp_gt_u32_e64 s[2:3], v0, s0 ; GCN-O0-NEXT: s_mov_b64 s[0:1], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s0, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s1, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] +; GCN-O0-NEXT: v_writelane_b32 v1, s0, 2 +; GCN-O0-NEXT: v_writelane_b32 v1, s1, 3 ; GCN-O0-NEXT: s_and_b64 s[0:1], s[0:1], s[2:3] ; GCN-O0-NEXT: s_mov_b64 exec, s[0:1] ; GCN-O0-NEXT: s_cbranch_execz .LBB4_2 ; GCN-O0-NEXT: ; %bb.1: ; %bb.then ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_load_dword v1, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_load_dword v2, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload ; GCN-O0-NEXT: v_readlane_b32 s0, v1, 0 ; GCN-O0-NEXT: v_readlane_b32 s1, v1, 1 ; GCN-O0-NEXT: s_mov_b32 s2, 0xf000 @@ -968,24 +814,20 @@ ; GCN-O0-NEXT: s_mov_b32 s5, s2 ; GCN-O0-NEXT: ; kill: def $sgpr0_sgpr1 killed $sgpr0_sgpr1 def $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-O0-NEXT: s_mov_b64 s[2:3], s[4:5] -; GCN-O0-NEXT: v_ashrrev_i32_e64 v2, 31, v0 -; GCN-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec -; GCN-O0-NEXT: v_mov_b32_e32 v1, v2 +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: v_ashrrev_i32_e64 v0, 31, v2 +; GCN-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; GCN-O0-NEXT: v_mov_b32_e32 v3, v0 ; GCN-O0-NEXT: s_mov_b32 s4, 2 -; GCN-O0-NEXT: v_lshl_b64 v[1:2], v[0:1], s4 +; GCN-O0-NEXT: v_lshl_b64 v[2:3], v[2:3], s4 ; GCN-O0-NEXT: v_mov_b32_e32 v0, 0 -; GCN-O0-NEXT: buffer_store_dword v0, v[1:2], s[0:3], 0 addr64 +; GCN-O0-NEXT: buffer_store_dword v0, v[2:3], s[0:3], 0 addr64 ; GCN-O0-NEXT: .LBB4_2: ; %bb.end -; GCN-O0-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[12:15], 0 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s0, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s1, v0, 3 +; GCN-O0-NEXT: v_readlane_b32 s0, v1, 2 +; GCN-O0-NEXT: v_readlane_b32 s1, v1, 3 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[0:1] +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-O0-NEXT: s_barrier -; GCN-O0-NEXT: ; kill: killed $vgpr0 ; GCN-O0-NEXT: s_endpgm bb: %tmp = tail call i32 @llvm.amdgcn.workitem.id.x() @@ -1077,127 +919,96 @@ ; GCN-O0: ; %bb.0: ; %bb ; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] -; GCN-O0-NEXT: ; implicit-def: $vgpr1 -; GCN-O0-NEXT: v_mov_b32_e32 v1, v0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(1) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_writelane_b32 v0, s6, 0 -; GCN-O0-NEXT: v_writelane_b32 v0, s7, 1 -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: s_waitcnt expcnt(1) +; GCN-O0-NEXT: v_writelane_b32 v1, s6, 0 +; GCN-O0-NEXT: v_writelane_b32 v1, s7, 1 +; GCN-O0-NEXT: v_writelane_b32 v1, s4, 2 +; GCN-O0-NEXT: v_writelane_b32 v1, s5, 3 ; GCN-O0-NEXT: .LBB5_1: ; %bb1 ; GCN-O0-NEXT: ; =>This Inner Loop Header: Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_readlane_b32 s8, v0, 2 -; GCN-O0-NEXT: v_readlane_b32 s9, v0, 3 -; GCN-O0-NEXT: v_readlane_b32 s6, v0, 0 -; GCN-O0-NEXT: v_readlane_b32 s7, v0, 1 -; GCN-O0-NEXT: v_writelane_b32 v0, s6, 4 -; GCN-O0-NEXT: v_writelane_b32 v0, s7, 5 +; GCN-O0-NEXT: v_readlane_b32 s8, v1, 2 +; GCN-O0-NEXT: v_readlane_b32 s9, v1, 3 +; GCN-O0-NEXT: v_readlane_b32 s6, v1, 0 +; GCN-O0-NEXT: v_readlane_b32 s7, v1, 1 +; GCN-O0-NEXT: v_writelane_b32 v1, s6, 4 +; GCN-O0-NEXT: v_writelane_b32 v1, s7, 5 ; GCN-O0-NEXT: s_mov_b32 s4, 0x207 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_cmp_lt_i32_e64 s[4:5], v1, s4 +; GCN-O0-NEXT: v_cmp_lt_i32_e64 s[4:5], v0, s4 ; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[8:9] -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 6 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 7 -; GCN-O0-NEXT: v_writelane_b32 v0, s6, 0 -; GCN-O0-NEXT: v_writelane_b32 v0, s7, 1 +; GCN-O0-NEXT: v_writelane_b32 v1, s4, 6 +; GCN-O0-NEXT: v_writelane_b32 v1, s5, 7 +; GCN-O0-NEXT: v_writelane_b32 v1, s6, 0 +; GCN-O0-NEXT: v_writelane_b32 v1, s7, 1 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v0, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v0, s7, 3 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: v_writelane_b32 v1, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v1, s7, 3 ; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1 ; GCN-O0-NEXT: ; %bb.2: ; %bb2 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) ; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_waitcnt vmcnt(1) -; GCN-O0-NEXT: v_readlane_b32 s4, v0, 6 -; GCN-O0-NEXT: v_readlane_b32 s5, v0, 7 +; GCN-O0-NEXT: v_readlane_b32 s4, v1, 6 +; GCN-O0-NEXT: v_readlane_b32 s5, v1, 7 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_mov_b32 s6, 0 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], v1, s6 -; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, s6 -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 8 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 9 +; GCN-O0-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s6 +; GCN-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v0, s6 +; GCN-O0-NEXT: v_writelane_b32 v1, s4, 8 +; GCN-O0-NEXT: v_writelane_b32 v1, s5, 9 ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: s_mov_b32 s8, s4 ; GCN-O0-NEXT: s_mov_b32 s9, s4 ; GCN-O0-NEXT: s_mov_b32 s10, s4 ; GCN-O0-NEXT: s_mov_b32 s11, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s11 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s11 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 10 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 11 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: v_writelane_b32 v1, s4, 10 +; GCN-O0-NEXT: v_writelane_b32 v1, s5, 11 ; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execz .LBB5_5 ; GCN-O0-NEXT: ; %bb.3: ; %bb4 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: ; implicit-def: $sgpr4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s4 -; GCN-O0-NEXT: buffer_load_dword v1, v1, s[0:3], 0 offen +; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 +; GCN-O0-NEXT: buffer_load_dword v0, v0, s[0:3], 0 offen ; GCN-O0-NEXT: s_mov_b32 s4, 0 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_cmp_lt_f32_e64 s[6:7], v1, s4 +; GCN-O0-NEXT: v_cmp_lt_f32_e64 s[6:7], v0, s4 ; GCN-O0-NEXT: s_mov_b32 s8, s4 ; GCN-O0-NEXT: s_mov_b32 s9, s4 ; GCN-O0-NEXT: s_mov_b32 s10, s4 ; GCN-O0-NEXT: s_mov_b32 s11, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s8 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s9 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s10 -; GCN-O0-NEXT: v_mov_b32_e32 v4, s11 -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt expcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v2, s8 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s9 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s10 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s11 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 12 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 13 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: v_writelane_b32 v1, s4, 12 +; GCN-O0-NEXT: v_writelane_b32 v1, s5, 13 ; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execz .LBB5_6 @@ -1215,173 +1026,135 @@ ; GCN-O0-NEXT: s_mov_b32 s6, s9 ; GCN-O0-NEXT: s_mov_b32 s7, s8 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v0, s4 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s5 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s7 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 +; GCN-O0-NEXT: v_mov_b32_e32 v3, s5 +; GCN-O0-NEXT: v_mov_b32_e32 v4, s6 +; GCN-O0-NEXT: v_mov_b32_e32 v5, s7 +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_branch .LBB5_6 ; GCN-O0-NEXT: .LBB5_5: ; %Flow2 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v4, 10 -; GCN-O0-NEXT: v_readlane_b32 s5, v4, 11 +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s4, v1, 10 +; GCN-O0-NEXT: v_readlane_b32 s5, v1, 11 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_branch .LBB5_7 ; GCN-O0-NEXT: .LBB5_6: ; %Flow ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v4, 12 -; GCN-O0-NEXT: v_readlane_b32 s5, v4, 13 +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s4, v1, 12 +; GCN-O0-NEXT: v_readlane_b32 s5, v1, 13 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_branch .LBB5_5 ; GCN-O0-NEXT: .LBB5_7: ; %bb10 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(3) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s6, v0, 8 -; GCN-O0-NEXT: v_readlane_b32 s7, v0, 9 +; GCN-O0-NEXT: v_readlane_b32 s6, v1, 8 +; GCN-O0-NEXT: v_readlane_b32 s7, v1, 9 ; GCN-O0-NEXT: s_mov_b64 s[4:5], -1 -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 14 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 15 +; GCN-O0-NEXT: v_writelane_b32 v1, s4, 14 +; GCN-O0-NEXT: v_writelane_b32 v1, s5, 15 ; GCN-O0-NEXT: s_mov_b64 s[4:5], exec -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 16 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 17 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: v_writelane_b32 v1, s4, 16 +; GCN-O0-NEXT: v_writelane_b32 v1, s5, 17 ; GCN-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execz .LBB5_9 ; GCN-O0-NEXT: ; %bb.8: ; %Flow1 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: s_mov_b64 s[4:5], 0 ; GCN-O0-NEXT: s_xor_b64 s[4:5], exec, -1 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_writelane_b32 v0, s4, 14 -; GCN-O0-NEXT: v_writelane_b32 v0, s5, 15 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] +; GCN-O0-NEXT: v_writelane_b32 v1, s4, 14 +; GCN-O0-NEXT: v_writelane_b32 v1, s5, 15 ; GCN-O0-NEXT: .LBB5_9: ; %Flow3 ; GCN-O0-NEXT: ; in Loop: Header=BB5_1 Depth=1 ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s8, v4, 16 -; GCN-O0-NEXT: v_readlane_b32 s9, v4, 17 +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload +; GCN-O0-NEXT: v_readlane_b32 s8, v1, 16 +; GCN-O0-NEXT: v_readlane_b32 s9, v1, 17 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[8:9] -; GCN-O0-NEXT: v_readlane_b32 s6, v4, 4 -; GCN-O0-NEXT: v_readlane_b32 s7, v4, 5 -; GCN-O0-NEXT: v_readlane_b32 s4, v4, 14 -; GCN-O0-NEXT: v_readlane_b32 s5, v4, 15 +; GCN-O0-NEXT: v_readlane_b32 s6, v1, 4 +; GCN-O0-NEXT: v_readlane_b32 s7, v1, 5 +; GCN-O0-NEXT: v_readlane_b32 s4, v1, 14 +; GCN-O0-NEXT: v_readlane_b32 s5, v1, 15 ; GCN-O0-NEXT: s_and_b64 s[4:5], exec, s[4:5] ; GCN-O0-NEXT: s_or_b64 s[4:5], s[4:5], s[6:7] ; GCN-O0-NEXT: s_mov_b64 s[6:7], 0 ; GCN-O0-NEXT: s_mov_b64 s[8:9], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v4, s8, 0 -; GCN-O0-NEXT: v_writelane_b32 v4, s9, 1 -; GCN-O0-NEXT: v_writelane_b32 v4, s6, 2 -; GCN-O0-NEXT: v_writelane_b32 v4, s7, 3 +; GCN-O0-NEXT: v_writelane_b32 v1, s8, 0 +; GCN-O0-NEXT: v_writelane_b32 v1, s9, 1 +; GCN-O0-NEXT: v_writelane_b32 v1, s6, 2 +; GCN-O0-NEXT: v_writelane_b32 v1, s7, 3 ; GCN-O0-NEXT: s_mov_b64 s[6:7], s[4:5] -; GCN-O0-NEXT: v_writelane_b32 v4, s6, 18 -; GCN-O0-NEXT: v_writelane_b32 v4, s7, 19 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-O0-NEXT: v_writelane_b32 v1, s6, 18 +; GCN-O0-NEXT: v_writelane_b32 v1, s7, 19 ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GCN-O0-NEXT: s_waitcnt vmcnt(0) +; GCN-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GCN-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; GCN-O0-NEXT: s_andn2_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: s_cbranch_execnz .LBB5_1 ; GCN-O0-NEXT: ; %bb.10: ; %bb12 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: s_waitcnt expcnt(3) -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v0, 18 -; GCN-O0-NEXT: v_readlane_b32 s5, v0, 19 +; GCN-O0-NEXT: v_readlane_b32 s4, v1, 18 +; GCN-O0-NEXT: v_readlane_b32 s5, v1, 19 ; GCN-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; GCN-O0-NEXT: ; %bb.11: ; %bb12 -; GCN-O0-NEXT: s_or_saveexec_b64 s[14:15], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[14:15] ; GCN-O0-NEXT: s_waitcnt expcnt(0) -; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v5, v4 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v5 ; GCN-O0-NEXT: ; implicit-def: $sgpr4 ; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 -; GCN-O0-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; GCN-O0-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v5, v3 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v4 ; GCN-O0-NEXT: ; implicit-def: $sgpr4 ; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 -; GCN-O0-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen +; GCN-O0-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen ; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) -; GCN-O0-NEXT: v_mov_b32_e32 v5, v2 +; GCN-O0-NEXT: v_mov_b32_e32 v0, v3 ; GCN-O0-NEXT: ; implicit-def: $sgpr4 ; GCN-O0-NEXT: v_mov_b32_e32 v6, s4 -; GCN-O0-NEXT: buffer_store_dword v5, v6, s[0:3], 0 offen -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec +; GCN-O0-NEXT: buffer_store_dword v0, v6, s[0:3], 0 offen +; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) +; GCN-O0-NEXT: v_mov_b32_e32 v0, v2 ; GCN-O0-NEXT: ; implicit-def: $sgpr4 ; GCN-O0-NEXT: v_mov_b32_e32 v2, s4 -; GCN-O0-NEXT: buffer_store_dword v1, v2, s[0:3], 0 offen +; GCN-O0-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen ; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: ; kill: killed $vgpr0 ; GCN-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GCN-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GCN-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload ; GCN-O0-NEXT: s_mov_b64 exec, s[4:5] ; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) ; GCN-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll --- a/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/control-flow-fastregalloc.ll @@ -10,7 +10,7 @@ ; GCN-LABEL: {{^}}divergent_if_endif: -; VGPR: workitem_private_segment_byte_size = 16{{$}} +; VGPR: workitem_private_segment_byte_size = 12{{$}} ; GCN: {{^}}; %bb.0: @@ -19,7 +19,7 @@ ; Spill load ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill -; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, s{{[0-9]+}} +; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0, s{{[0-9]+}} ; Spill saved exec ; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec @@ -82,13 +82,13 @@ } ; GCN-LABEL: {{^}}divergent_loop: -; VGPR: workitem_private_segment_byte_size = 20{{$}} +; VGPR: workitem_private_segment_byte_size = 16{{$}} ; GCN: {{^}}; %bb.0: ; GCN-DAG: s_mov_b32 m0, -1 ; GCN-DAG: v_mov_b32_e32 [[PTR0:v[0-9]+]], 0{{$}} ; GCN: ds_read_b32 [[LOAD0:v[0-9]+]], [[PTR0]] -; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v{{[0-9]+}}, s{{[0-9]+}} +; GCN: v_cmp_eq_u32_e64 [[CMP0:s\[[0-9]+:[0-9]+\]]], v0, s{{[0-9]+}} ; Spill load ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill @@ -166,7 +166,7 @@ ; GCN: buffer_store_dword [[LOAD0]], off, s[0:3], 0 offset:[[LOAD0_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; GCN: s_mov_b32 [[ZERO:s[0-9]+]], 0 -; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v{{[0-9]+}}, [[ZERO]] +; GCN: v_cmp_ne_u32_e64 [[CMP0:s\[[0-9]+:[0-9]\]]], v0, [[ZERO]] ; GCN: s_mov_b64 s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], exec ; GCN: s_and_b64 s[[[ANDEXEC_LO:[0-9]+]]:[[ANDEXEC_HI:[0-9]+]]], s[[[SAVEEXEC_LO:[0-9]+]]:[[SAVEEXEC_HI:[0-9]+]]], [[CMP0]] @@ -175,7 +175,6 @@ ; Spill saved exec ; VGPR: v_writelane_b32 [[SPILL_VGPR:v[0-9]+]], s[[SAVEEXEC_LO]], [[SAVEEXEC_LO_LANE:[0-9]+]] ; VGPR: v_writelane_b32 [[SPILL_VGPR]], s[[SAVEEXEC_HI]], [[SAVEEXEC_HI_LANE:[0-9]+]] -; VGPR: buffer_store_dword [[SPILL_VGPR]], off, s[0:3], 0 offset:[[VREG_SAVE_RESTORE_OFFSET:[0-9]+]] ; 4-byte Folded Spill ; VMEM: v_writelane_b32 v[[V_SAVEEXEC:[0-9]+]], s[[SAVEEXEC_LO]], 0 ; VMEM: v_writelane_b32 v[[V_SAVEEXEC]], s[[SAVEEXEC_HI]], 1 @@ -188,7 +187,6 @@ ; GCN-NEXT: s_branch [[ELSE:.LBB[0-9]+_[0-9]+]] ; GCN: [[FLOW]]: ; %Flow -; VGPR: buffer_load_dword [[SPILL_VGPR:v[0-9]+]], off, s[0:3], 0 offset:[[VREG_SAVE_RESTORE_OFFSET]] ; 4-byte Folded Reload ; GCN: buffer_load_dword [[FLOW_VAL:v[0-9]+]], off, s[0:3], 0 offset:[[FLOW_VAL_OFFSET:[0-9]+]] ; 4-byte Folded Reload ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_LO:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_LO_LANE]] ; VGPR: v_readlane_b32 s[[FLOW_S_RELOAD_SAVEEXEC_HI:[0-9]+]], [[SPILL_VGPR]], [[SAVEEXEC_HI_LANE]] diff --git a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll --- a/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/cross-block-use-is-not-abi-copy.ll @@ -31,10 +31,11 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v40, s16, 2 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, func_v2f32@rel32@lo+4 @@ -42,9 +43,10 @@ ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: v_readlane_b32 s4, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 @@ -67,10 +69,11 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v40, s16, 2 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, func_v3f32@rel32@lo+4 @@ -78,9 +81,10 @@ ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: v_readlane_b32 s4, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 @@ -103,10 +107,11 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v40, s16, 2 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, func_v4f16@rel32@lo+4 @@ -114,9 +119,10 @@ ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: v_readlane_b32 s4, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 @@ -139,10 +145,11 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v40, s16, 2 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 +; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, func_struct@rel32@lo+4 @@ -151,9 +158,10 @@ ; GCN-NEXT: v_mov_b32_e32 v1, v4 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 s4, v40, 2 +; GCN-NEXT: v_readlane_b32 s4, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll --- a/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll +++ b/llvm/test/CodeGen/AMDGPU/dwarf-multi-register-use-crash.ll @@ -20,8 +20,8 @@ ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 16 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 ; CHECK-NEXT: v_writelane_b32 v40, s31, 1 ; CHECK-NEXT: v_writelane_b32 v40, s34, 2 @@ -48,6 +48,7 @@ ; CHECK-NEXT: v_writelane_b32 v40, s47, 15 ; CHECK-NEXT: s_load_dwordx2 s[46:47], s[4:5], 0x0 ; CHECK-NEXT: s_mov_b64 s[4:5], s[40:41] +; CHECK-NEXT: v_writelane_b32 v42, s16, 0 ; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; CHECK-NEXT: v_mov_b32_e32 v41, v31 ; CHECK-NEXT: s_mov_b32 s42, s15 @@ -91,9 +92,10 @@ ; CHECK-NEXT: v_readlane_b32 s34, v40, 2 ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 16 +; CHECK-NEXT: v_readlane_b32 s4, v42, 0 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] ; CHECK-NEXT: s_addk_i32 s32, 0xfc00 ; CHECK-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir b/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/extend-wwm-virt-reg-liveness.mir +++ /dev/null @@ -1,279 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -start-before=si-lower-sgpr-spills -stop-after=virtregrewriter,1 -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s - -# Tests to check the conservative lieness extension for the wwm registers during SGPR spill lowering. - -# Even though the VGPR can be shared for the wwm-operand (writelane/readlane get inserted for the SGPR spills) -# and the regular operand (%0), they get different registers as we conservatively extend the liveness of the -# wwm-operands. ---- -name: test_single_block -tracksRegLiveness: true -frameInfo: - maxAlignment: 4 -stack: - - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } -machineFunctionInfo: - isEntryFunction: false - scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - stackPtrOffsetReg: '$sgpr32' - frameOffsetReg: '$sgpr33' - hasSpilledSGPRs: true -body: | - bb.0: - liveins: $sgpr4, $vgpr2_vgpr3 - ; GCN-LABEL: name: test_single_block - ; GCN: liveins: $sgpr4, $vgpr2_vgpr3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, killed $vgpr0 - ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0 - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec - ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr1, 0, 0, implicit $exec - ; GCN-NEXT: KILL killed renamable $vgpr0 - ; GCN-NEXT: SI_RETURN - SI_SPILL_S32_SAVE killed $sgpr4, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - S_NOP 0 - renamable $sgpr4 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - %0:vgpr_32 = V_MOV_B32_e32 20, implicit $exec - GLOBAL_STORE_DWORD $vgpr2_vgpr3, %0:vgpr_32, 0, 0, implicit $exec - SI_RETURN -... - -# Due to the presence of wwm-operand in the divergent flow, the regular variable (%0) shouldn't get the same register -# allocated for the wwm-operand in writelane/readlane when the SGPR spill is lowered. - ---- -name: test_if_else -tracksRegLiveness: true -frameInfo: - maxAlignment: 4 -stack: - - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } -machineFunctionInfo: - isEntryFunction: false - scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - stackPtrOffsetReg: '$sgpr32' - frameOffsetReg: '$sgpr33' - hasSpilledSGPRs: true -body: | - ; GCN-LABEL: name: test_if_else - ; GCN: bb.0: - ; GCN-NEXT: successors: %bb.1(0x80000000) - ; GCN-NEXT: liveins: $sgpr6, $sgpr10_sgpr11 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; GCN-NEXT: S_BRANCH %bb.1 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.3(0x40000000), %bb.2(0x40000000) - ; GCN-NEXT: liveins: $sgpr6, $vgpr0, $sgpr10_sgpr11 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 10, implicit $exec - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.2: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $sgpr6, $vgpr0, $sgpr10_sgpr11 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr6, 0, killed $vgpr0 - ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 0 - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.3: - ; GCN-NEXT: liveins: $vgpr0, $vgpr1, $sgpr10_sgpr11 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr5 = V_READFIRSTLANE_B32 killed $vgpr1, implicit $exec - ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0 - ; GCN-NEXT: KILL killed renamable $vgpr0 - ; GCN-NEXT: SI_RETURN - bb.0: - liveins: $sgpr6, $sgpr10_sgpr11 - S_BRANCH %bb.1 - bb.1: - liveins: $sgpr6, $sgpr10_sgpr11 - %0:vgpr_32 = V_MOV_B32_e32 10, implicit $exec - S_CBRANCH_EXECZ %bb.3, implicit $exec - bb.2: - liveins: $sgpr6, $sgpr10_sgpr11 - SI_SPILL_S32_SAVE killed $sgpr6, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - S_NOP 0 - renamable $sgpr6 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - %0:vgpr_32 = V_MOV_B32_e32 20, implicit $exec - S_BRANCH %bb.3 - bb.3: - liveins: $sgpr10_sgpr11 - $sgpr5 = V_READFIRSTLANE_B32 %0:vgpr_32, implicit $exec - S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 0 - SI_RETURN -... - -# The wwm-register usage outside the loop should have the interference marked with -# all the regular virtual registers used in the test. The divergent loop index value (%1) -# can actually share the same VGPR as the wwm-operand. But since we extend the liveness of -# the wwm operand, an interference will always exist between them. - ---- -name: test_loop -tracksRegLiveness: true -frameInfo: - maxAlignment: 4 -stack: - - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } -machineFunctionInfo: - isEntryFunction: false - scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - stackPtrOffsetReg: '$sgpr32' - frameOffsetReg: '$sgpr33' - hasSpilledSGPRs: true -body: | - ; GCN-LABEL: name: test_loop - ; GCN: bb.0: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GCN-NEXT: liveins: $sgpr4, $sgpr10_sgpr11 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 10, implicit $exec - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: liveins: $sgpr4, $vgpr0, $sgpr10_sgpr11 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, killed $vgpr0 - ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0 - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.2 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.2: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $sgpr4, $vgpr0, $vgpr1, $sgpr10_sgpr11 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr4, $sgpr10_sgpr11, 0, 0 - ; GCN-NEXT: $sgpr5 = V_READFIRSTLANE_B32 killed $vgpr1, implicit $exec - ; GCN-NEXT: S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 4 - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 5, implicit $exec - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.3, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.3: - ; GCN-NEXT: successors: %bb.5(0x40000000), %bb.4(0x40000000) - ; GCN-NEXT: liveins: $vgpr0, $vgpr1 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $vcc = V_CMP_EQ_U32_e64 0, $vgpr1, implicit $exec - ; GCN-NEXT: $sgpr6_sgpr7 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - ; GCN-NEXT: S_CBRANCH_SCC1 %bb.5, implicit $scc - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.4: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $vgpr0, $vgpr1, $sgpr6_sgpr7 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_SUB_U32_e32 1, killed $vgpr1, implicit $exec - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 killed $vgpr1, implicit $exec - ; GCN-NEXT: S_BRANCH %bb.3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.5: - ; GCN-NEXT: liveins: $vgpr0, $sgpr6_sgpr7 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $exec = S_OR_B64 $exec, $sgpr6_sgpr7, implicit-def $scc - ; GCN-NEXT: KILL killed renamable $vgpr0 - ; GCN-NEXT: SI_RETURN - bb.0: - liveins: $sgpr4, $sgpr10_sgpr11 - %0:vgpr_32 = V_MOV_B32_e32 10, implicit $exec - S_CBRANCH_EXECZ %bb.2, implicit $exec - bb.1: - liveins: $sgpr4, $sgpr10_sgpr11 - SI_SPILL_S32_SAVE killed $sgpr4, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - S_NOP 0 - renamable $sgpr4 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - %0:vgpr_32 = V_MOV_B32_e32 20, implicit $exec - S_BRANCH %bb.2 - bb.2: - liveins: $sgpr4, $sgpr10_sgpr11 - S_STORE_DWORD_IMM $sgpr4, $sgpr10_sgpr11, 0, 0 - $sgpr5 = V_READFIRSTLANE_B32 %0:vgpr_32, implicit $exec - S_STORE_DWORD_IMM $sgpr5, $sgpr10_sgpr11, 0, 4 - %1:vgpr_32 = V_MOV_B32_e32 5, implicit $exec - S_CBRANCH_EXECZ %bb.3, implicit $exec - S_BRANCH %bb.3 - bb.3: - $vcc = V_CMP_EQ_U32_e64 0, %1:vgpr_32, implicit $exec - $sgpr6_sgpr7 = S_AND_SAVEEXEC_B64 $vcc, implicit-def $exec, implicit-def $scc, implicit $exec - S_CBRANCH_SCC1 %bb.5, implicit $scc - bb.4: - liveins: $sgpr6_sgpr7 - %2:vgpr_32 = V_SUB_U32_e32 1, %1:vgpr_32, implicit $exec - %1:vgpr_32 = V_MOV_B32_e32 %2:vgpr_32, implicit $exec - S_BRANCH %bb.3 - bb.5: - liveins: $sgpr6_sgpr7 - $exec = S_OR_B64 $exec, $sgpr6_sgpr7, implicit-def $scc - SI_RETURN -... - -# There must be one KILL instruction for the wwm-operand in every return block. -# Due to that, the wwm-register allocated should be different from the ones -# allocated for the regular virtual registers. - ---- -name: test_multiple_return_blocks -tracksRegLiveness: true -frameInfo: - maxAlignment: 4 -stack: - - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } -machineFunctionInfo: - isEntryFunction: false - scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - stackPtrOffsetReg: '$sgpr32' - frameOffsetReg: '$sgpr33' - hasSpilledSGPRs: true -body: | - ; GCN-LABEL: name: test_multiple_return_blocks - ; GCN: bb.0: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GCN-NEXT: liveins: $sgpr4, $vgpr2_vgpr3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; GCN-NEXT: S_CBRANCH_EXECZ %bb.2, implicit $exec - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.1: - ; GCN-NEXT: liveins: $sgpr4, $vgpr0, $vgpr2_vgpr3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, killed $vgpr0 - ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0 - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 10, implicit $exec - ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr1, 0, 0, implicit $exec - ; GCN-NEXT: KILL killed renamable $vgpr0 - ; GCN-NEXT: SI_RETURN - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.2: - ; GCN-NEXT: liveins: $vgpr0, $vgpr2_vgpr3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = V_MOV_B32_e32 20, implicit $exec - ; GCN-NEXT: GLOBAL_STORE_DWORD $vgpr2_vgpr3, killed renamable $vgpr1, 0, 0, implicit $exec - ; GCN-NEXT: KILL killed renamable $vgpr0 - ; GCN-NEXT: SI_RETURN - bb.0: - liveins: $sgpr4, $vgpr2_vgpr3 - S_CBRANCH_EXECZ %bb.2, implicit $exec - bb.1: - liveins: $sgpr4, $vgpr2_vgpr3 - SI_SPILL_S32_SAVE killed $sgpr4, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - S_NOP 0 - renamable $sgpr4 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - %0:vgpr_32 = V_MOV_B32_e32 10, implicit $exec - GLOBAL_STORE_DWORD $vgpr2_vgpr3, %0:vgpr_32, 0, 0, implicit $exec - SI_RETURN - bb.2: - liveins: $vgpr2_vgpr3 - %1:vgpr_32 = V_MOV_B32_e32 20, implicit $exec - GLOBAL_STORE_DWORD $vgpr2_vgpr3, %1:vgpr_32, 0, 0, implicit $exec - SI_RETURN -... diff --git a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/fix-frame-reg-in-custom-csr-spills.ll @@ -16,9 +16,10 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: s_addk_i32 s32, 0x400 -; GCN-NEXT: v_writelane_b32 v42, s16, 2 +; GCN-NEXT: s_addk_i32 s32, 0x800 +; GCN-NEXT: v_writelane_b32 v43, s16, 0 ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, extern_func@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, extern_func@gotpcrel32@hi+12 @@ -38,11 +39,12 @@ ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: v_readlane_b32 s31, v42, 1 ; GCN-NEXT: v_readlane_b32 s30, v42, 0 -; GCN-NEXT: v_readlane_b32 s4, v42, 2 +; GCN-NEXT: v_readlane_b32 s4, v43, 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_addk_i32 s32, 0xf800 ; GCN-NEXT: s_mov_b32 s33, s4 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll --- a/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-scratch-init.ll @@ -117,15 +117,21 @@ ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s2 ; FLAT_SCR_OPT-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s3 ; FLAT_SCR_OPT-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; FLAT_SCR_OPT-NEXT: ; implicit-def: $vgpr0 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s4, exec_lo +; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, 3 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s5, 0 +; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s5 ; FLAT_SCR_OPT-NEXT: s_waitcnt lgkmcnt(0) ; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s2, 0 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s5, 4 ; FLAT_SCR_OPT-NEXT: v_writelane_b32 v0, s3, 1 -; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, 4 -; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill +; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v0, s5 ; 4-byte Folded Spill ; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s5, 0 +; FLAT_SCR_OPT-NEXT: scratch_load_dword v0, off, s5 +; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) +; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 +; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s4 ; FLAT_SCR_OPT-NEXT: s_load_dword vcc_lo, s[0:1], 0x8 ; FLAT_SCR_OPT-NEXT: ; kill: killed $sgpr0_sgpr1 ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART @@ -222,31 +228,44 @@ ; FLAT_SCR_OPT-NEXT: ;;#ASMEND ; FLAT_SCR_OPT-NEXT: ;;#ASMSTART ; FLAT_SCR_OPT-NEXT: ;;#ASMEND -; FLAT_SCR_OPT-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_OPT-NEXT: s_mov_b32 s0, 4 -; FLAT_SCR_OPT-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload +; FLAT_SCR_OPT-NEXT: s_mov_b32 s2, exec_lo +; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, 3 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 0 +; FLAT_SCR_OPT-NEXT: scratch_store_dword off, v1, s3 +; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 4 +; FLAT_SCR_OPT-NEXT: scratch_load_dword v1, off, s3 ; 4-byte Folded Reload ; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s105 +; FLAT_SCR_OPT-NEXT: s_mov_b32 s3, 0 ; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) ; FLAT_SCR_OPT-NEXT: v_readlane_b32 s0, v1, 0 ; FLAT_SCR_OPT-NEXT: v_readlane_b32 s1, v1, 1 -; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v2, 0 -; FLAT_SCR_OPT-NEXT: ; kill: killed $vgpr1 -; FLAT_SCR_OPT-NEXT: global_store_dword v2, v0, s[0:1] +; FLAT_SCR_OPT-NEXT: scratch_load_dword v1, off, s3 +; FLAT_SCR_OPT-NEXT: s_waitcnt vmcnt(0) +; FLAT_SCR_OPT-NEXT: s_waitcnt_depctr 0xffe3 +; FLAT_SCR_OPT-NEXT: s_mov_b32 exec_lo, s2 +; FLAT_SCR_OPT-NEXT: v_mov_b32_e32 v1, 0 +; FLAT_SCR_OPT-NEXT: global_store_dword v1, v0, s[0:1] ; FLAT_SCR_OPT-NEXT: s_endpgm ; ; FLAT_SCR_ARCH-LABEL: test: ; FLAT_SCR_ARCH: ; %bb.0: ; FLAT_SCR_ARCH-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 -; FLAT_SCR_ARCH-NEXT: ; implicit-def: $vgpr0 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s4, exec_lo +; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, 3 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s5, 0 +; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s5 ; FLAT_SCR_ARCH-NEXT: s_waitcnt lgkmcnt(0) ; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s2, 0 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s5, 4 ; FLAT_SCR_ARCH-NEXT: v_writelane_b32 v0, s3, 1 -; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, 4 -; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s2 ; 4-byte Folded Spill +; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v0, s5 ; 4-byte Folded Spill ; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s5, 0 +; FLAT_SCR_ARCH-NEXT: scratch_load_dword v0, off, s5 +; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) +; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s4 ; FLAT_SCR_ARCH-NEXT: s_load_dword vcc_lo, s[0:1], 0x8 ; FLAT_SCR_ARCH-NEXT: ; kill: killed $sgpr0_sgpr1 ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART @@ -343,17 +362,24 @@ ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND ; FLAT_SCR_ARCH-NEXT: ;;#ASMSTART ; FLAT_SCR_ARCH-NEXT: ;;#ASMEND -; FLAT_SCR_ARCH-NEXT: s_or_saveexec_b32 s105, -1 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 s0, 4 -; FLAT_SCR_ARCH-NEXT: scratch_load_dword v1, off, s0 ; 4-byte Folded Reload +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s2, exec_lo +; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, 3 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 0 +; FLAT_SCR_ARCH-NEXT: scratch_store_dword off, v1, s3 +; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 4 +; FLAT_SCR_ARCH-NEXT: scratch_load_dword v1, off, s3 ; 4-byte Folded Reload ; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 -; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s105 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 s3, 0 ; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) ; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s0, v1, 0 ; FLAT_SCR_ARCH-NEXT: v_readlane_b32 s1, v1, 1 -; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v2, 0 -; FLAT_SCR_ARCH-NEXT: ; kill: killed $vgpr1 -; FLAT_SCR_ARCH-NEXT: global_store_dword v2, v0, s[0:1] +; FLAT_SCR_ARCH-NEXT: scratch_load_dword v1, off, s3 +; FLAT_SCR_ARCH-NEXT: s_waitcnt vmcnt(0) +; FLAT_SCR_ARCH-NEXT: s_waitcnt_depctr 0xffe3 +; FLAT_SCR_ARCH-NEXT: s_mov_b32 exec_lo, s2 +; FLAT_SCR_ARCH-NEXT: v_mov_b32_e32 v1, 0 +; FLAT_SCR_ARCH-NEXT: global_store_dword v1, v0, s[0:1] ; FLAT_SCR_ARCH-NEXT: s_endpgm call void asm sideeffect "", "~{s[0:7]}" () call void asm sideeffect "", "~{s[8:15]}" () diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-exec.mir @@ -12,13 +12,14 @@ body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_lo - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def $exec_lo ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_lo - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def $exec_lo @@ -37,13 +38,14 @@ body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec_hi - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def $exec_hi ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $exec_hi - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $exec_hi = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def $exec_hi @@ -62,16 +64,17 @@ body: | bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_exec - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def $exec ; CHECK-NEXT: $sgpr0_sgpr1 = S_MOV_B64 $exec - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1, implicit killed renamable $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 killed $vgpr0, 1 + ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def $exec @@ -93,12 +96,13 @@ body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_lo - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_lo - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $exec_lo = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_lo @@ -116,12 +120,13 @@ body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec_hi - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $exec_hi - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $exec_hi = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def %0:sreg_32, implicit-def %1:sreg_32, implicit-def $exec_hi @@ -139,15 +144,16 @@ body: | bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_exec - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def $exec - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, killed $vgpr0, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1, implicit $sgpr0_sgpr1 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr1, 1, $vgpr0, implicit $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0_sgpr1, implicit-def dead renamable $sgpr2_sgpr3, implicit-def dead renamable $sgpr0_sgpr1 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr0_sgpr1 - ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 killed $vgpr0, 1 + ; CHECK-NEXT: $sgpr1 = V_READLANE_B32 $vgpr0, 1 ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec S_NOP 0, implicit-def %0:sreg_64, implicit-def %1:sreg_64, implicit-def $exec diff --git a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir --- a/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir +++ b/llvm/test/CodeGen/AMDGPU/fold-reload-into-m0.mir @@ -13,13 +13,14 @@ bb.0: ; CHECK-LABEL: name: merge_sgpr_spill_into_copy_from_m0 - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def $m0 ; CHECK-NEXT: $sgpr0 = S_MOV_B32 $m0 - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0, implicit killed renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $m0 = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec @@ -43,12 +44,13 @@ bb.0: ; CHECK-LABEL: name: reload_sgpr_spill_into_copy_to_m0 - ; CHECK: renamable $vgpr0 = IMPLICIT_DEF + ; CHECK: liveins: $vgpr0 + ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_NOP 0, implicit-def renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def $m0 - ; CHECK-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, killed $vgpr0 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr0, 0, $vgpr0 ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: S_NOP 0, implicit killed renamable $sgpr0, implicit-def dead renamable $sgpr1, implicit-def dead renamable $sgpr0 - ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 killed $vgpr0, 0 + ; CHECK-NEXT: $sgpr0 = V_READLANE_B32 $vgpr0, 0 ; CHECK-NEXT: $m0 = S_MOV_B32 killed $sgpr0 ; CHECK-NEXT: S_NOP 0 ; CHECK-NEXT: S_SENDMSG 0, implicit $m0, implicit $exec diff --git a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/frame-setup-without-sgpr-to-vgpr-spills.ll @@ -14,11 +14,12 @@ ; SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s32 ; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[8:9], -1 ; SPILL-TO-VGPR-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; SPILL-TO-VGPR-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[8:9] -; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s4, 2 ; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0x400 ; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s30, 0 ; SPILL-TO-VGPR-NEXT: v_mov_b32_e32 v0, 0 +; SPILL-TO-VGPR-NEXT: v_writelane_b32 v41, s4, 0 ; SPILL-TO-VGPR-NEXT: v_writelane_b32 v40, s31, 1 ; SPILL-TO-VGPR-NEXT: buffer_store_dword v0, off, s[0:3], s33 ; SPILL-TO-VGPR-NEXT: s_waitcnt vmcnt(0) @@ -28,9 +29,10 @@ ; SPILL-TO-VGPR-NEXT: s_swappc_b64 s[30:31], s[4:5] ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s31, v40, 1 ; SPILL-TO-VGPR-NEXT: v_readlane_b32 s30, v40, 0 -; SPILL-TO-VGPR-NEXT: v_readlane_b32 s4, v40, 2 +; SPILL-TO-VGPR-NEXT: v_readlane_b32 s4, v41, 0 ; SPILL-TO-VGPR-NEXT: s_or_saveexec_b64 s[6:7], -1 ; SPILL-TO-VGPR-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; SPILL-TO-VGPR-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; SPILL-TO-VGPR-NEXT: s_mov_b64 exec, s[6:7] ; SPILL-TO-VGPR-NEXT: s_addk_i32 s32, 0xfc00 ; SPILL-TO-VGPR-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-call-non-gfx-func.ll @@ -8,7 +8,7 @@ ; SDAG-LABEL: gfx_func: ; SDAG: ; %bb.0: ; SDAG-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; SDAG-NEXT: s_mov_b32 s36, s33 +; SDAG-NEXT: s_mov_b32 s38, s33 ; SDAG-NEXT: s_mov_b32 s33, s32 ; SDAG-NEXT: s_or_saveexec_b64 s[34:35], -1 ; SDAG-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -81,14 +81,14 @@ ; SDAG-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; SDAG-NEXT: s_mov_b64 exec, s[34:35] ; SDAG-NEXT: s_addk_i32 s32, 0xfc00 -; SDAG-NEXT: s_mov_b32 s33, s36 +; SDAG-NEXT: s_mov_b32 s33, s38 ; SDAG-NEXT: s_waitcnt vmcnt(0) ; SDAG-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: gfx_func: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s36, s33 +; GISEL-NEXT: s_mov_b32 s38, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -161,7 +161,7 @@ ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[34:35] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s36 +; GISEL-NEXT: s_mov_b32 s33, s38 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call void @extern_c_func() diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-argument-types.ll @@ -101,11 +101,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i1@rel32@lo+4 @@ -114,9 +115,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -130,23 +132,26 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -160,24 +165,27 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 1 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: scratch_store_b8 off, v0, s32 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: scratch_store_b8 off, v0, s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -191,23 +199,26 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -226,12 +237,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i1_signext@rel32@lo+4 @@ -241,9 +253,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -257,25 +270,28 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_signext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -289,25 +305,29 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: scratch_store_b8 off, v0, s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -321,25 +341,28 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_signext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -359,12 +382,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i1_zeroext@rel32@lo+4 @@ -374,9 +398,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -390,25 +415,28 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_zeroext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_zeroext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -422,25 +450,29 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX11-NEXT: scratch_store_b8 off, v0, s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -454,25 +486,28 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_zeroext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: v_and_b32_e32 v0, 1, v0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -492,11 +527,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i8@rel32@lo+4 @@ -504,9 +540,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -520,22 +557,25 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -549,23 +589,27 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -579,22 +623,25 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -613,12 +660,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_sbyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i8_signext@rel32@lo+4 @@ -626,9 +674,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -642,23 +691,26 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_sbyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_signext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -672,24 +724,28 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_i8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_signext@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -703,23 +759,26 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_sbyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_signext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -739,12 +798,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ubyte v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i8_zeroext@rel32@lo+4 @@ -752,9 +812,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -768,23 +829,26 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_zeroext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_zeroext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -798,24 +862,28 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u8 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_zeroext@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -829,23 +897,26 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_zeroext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -865,11 +936,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i16@rel32@lo+4 @@ -877,9 +949,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -893,22 +966,25 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -922,23 +998,27 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -952,22 +1032,25 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -986,12 +1069,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i16_signext@rel32@lo+4 @@ -999,9 +1083,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1015,23 +1100,26 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_signext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16_signext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1045,24 +1133,28 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_signext@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1076,23 +1168,26 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_signext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_signext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -1112,12 +1207,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_ushort v0, v[0:1], off glc ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i16_zeroext@rel32@lo+4 @@ -1125,9 +1221,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1141,23 +1238,26 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_zeroext@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16_zeroext@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1171,24 +1271,28 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_u16 v0, v[0:1], off glc dlc ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_zeroext@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1202,23 +1306,26 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_ushort v0, v[0:1], off glc dlc ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_zeroext@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_zeroext@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -1238,11 +1345,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i32@rel32@lo+4 @@ -1250,9 +1358,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1266,22 +1375,25 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1295,23 +1407,27 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1325,22 +1441,25 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -1359,12 +1478,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX9-NEXT: v_mov_b32_e32 v1, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i64@rel32@lo+4 @@ -1372,9 +1492,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1388,23 +1509,26 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1418,23 +1542,27 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0x7b :: v_dual_mov_b32 v1, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1448,24 +1576,27 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x7b ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload -; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 +; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 @@ -1483,13 +1614,14 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4 @@ -1497,9 +1629,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1513,24 +1646,27 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64@rel32@hi+12 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1544,25 +1680,29 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1576,24 +1716,27 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -1613,14 +1756,15 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-NEXT: v_mov_b32_e32 v2, 3 ; GFX9-NEXT: v_mov_b32_e32 v3, 4 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4 @@ -1628,9 +1772,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1644,25 +1789,28 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1676,24 +1824,28 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1707,25 +1859,28 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -1744,15 +1899,16 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 1 ; GFX9-NEXT: v_mov_b32_e32 v5, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i64@rel32@lo+4 @@ -1760,9 +1916,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1776,26 +1933,29 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1809,25 +1969,29 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i64@rel32@hi+12 -; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1841,26 +2005,29 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -1882,17 +2049,18 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v4, 1 ; GFX9-NEXT: v_mov_b32_e32 v5, 2 ; GFX9-NEXT: v_mov_b32_e32 v6, 3 ; GFX9-NEXT: v_mov_b32_e32 v7, 4 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i64@rel32@lo+4 @@ -1900,9 +2068,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1916,28 +2085,31 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-NEXT: v_mov_b32_e32 v6, 3 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v7, 4 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1951,26 +2123,30 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v5, 2 ; GFX11-NEXT: v_dual_mov_b32 v1, 0 :: v_dual_mov_b32 v4, 1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v6, 3 :: v_dual_mov_b32 v7, 4 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i64@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1984,28 +2160,31 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 3 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 4 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -2026,11 +2205,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x4400 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_f16@rel32@lo+4 @@ -2038,9 +2218,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -2054,22 +2235,25 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x4400 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -2083,23 +2267,27 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x4400 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -2113,22 +2301,25 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x4400 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -2147,11 +2338,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 4.0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_f32@rel32@lo+4 @@ -2159,9 +2351,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -2175,22 +2368,25 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 4.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -2204,23 +2400,27 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 4.0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -2234,22 +2434,25 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 4.0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -2268,12 +2471,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2f32@rel32@lo+4 @@ -2281,9 +2485,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -2297,23 +2502,26 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -2327,23 +2535,27 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -2357,23 +2569,26 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -2392,13 +2607,14 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f32@rel32@lo+4 @@ -2406,9 +2622,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -2422,24 +2639,27 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -2453,24 +2673,28 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -2484,24 +2708,27 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -2520,8 +2747,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1.0 @@ -2529,6 +2756,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX9-NEXT: v_mov_b32_e32 v3, -1.0 ; GFX9-NEXT: v_mov_b32_e32 v4, 0.5 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v5f32@rel32@lo+4 @@ -2536,9 +2764,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -2552,26 +2781,29 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-NEXT: v_mov_b32_e32 v3, -1.0 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0.5 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5f32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -2585,25 +2817,29 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1.0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v2, 4.0 :: v_dual_mov_b32 v3, -1.0 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0.5 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5f32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -2617,26 +2853,29 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 4.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, -1.0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0.5 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5f32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -2655,12 +2894,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40100000 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_f64@rel32@lo+4 @@ -2668,9 +2908,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -2684,23 +2925,26 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -2714,23 +2958,27 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0x40100000 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f64@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -2744,23 +2992,26 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40100000 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -2779,14 +3030,15 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2f64@rel32@lo+4 @@ -2794,9 +3046,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -2810,25 +3063,28 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -2842,24 +3098,28 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -2873,25 +3133,28 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -2910,8 +3173,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 @@ -2920,6 +3183,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v3, 0x40100000 ; GFX9-NEXT: v_mov_b32_e32 v4, 0 ; GFX9-NEXT: v_mov_b32_e32 v5, 0x40200000 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f64@rel32@lo+4 @@ -2927,9 +3191,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -2943,27 +3208,30 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-NEXT: v_mov_b32_e32 v5, 0x40200000 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f64@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f64@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -2977,25 +3245,29 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 2.0 ; GFX11-NEXT: v_dual_mov_b32 v2, 0 :: v_dual_mov_b32 v3, 0x40100000 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v4, 0 :: v_dual_mov_b32 v5, 0x40200000 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f64@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -3009,27 +3281,30 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2.0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 0x40100000 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 0x40200000 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f64@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -3048,11 +3323,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i16@rel32@lo+4 @@ -3060,9 +3336,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -3076,22 +3353,25 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -3105,23 +3385,27 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -3135,22 +3419,25 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -3170,11 +3457,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 @@ -3182,9 +3470,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -3198,22 +3487,25 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -3227,23 +3519,27 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -3257,22 +3553,25 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -3292,11 +3591,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 @@ -3304,9 +3604,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -3320,22 +3621,25 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -3349,23 +3653,27 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -3379,22 +3687,25 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -3414,12 +3725,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX9-NEXT: v_mov_b32_e32 v1, 3 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 @@ -3427,9 +3739,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -3443,23 +3756,26 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-NEXT: v_mov_b32_e32 v1, 3 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -3473,23 +3789,27 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0x20001 :: v_dual_mov_b32 v1, 3 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -3503,23 +3823,26 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 3 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -3538,12 +3861,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x4400 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 @@ -3551,9 +3875,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -3567,23 +3892,26 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x4400 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -3597,24 +3925,28 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x4400 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -3628,23 +3960,26 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x40003c00 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x4400 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -3663,11 +3998,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 @@ -3675,9 +4011,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -3691,22 +4028,25 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -3720,23 +4060,27 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -3750,22 +4094,25 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -3785,12 +4132,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX9-NEXT: v_mov_b32_e32 v1, 0x40003 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 @@ -3798,9 +4146,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -3814,23 +4163,26 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x40003 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -3844,24 +4196,28 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x40003 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -3875,23 +4231,26 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x20001 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x40003 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -3910,11 +4269,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dword v0, v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2f16@rel32@lo+4 @@ -3922,9 +4282,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -3938,22 +4299,25 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dword v0, v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f16@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -3967,23 +4331,27 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b32 v0, v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -3997,22 +4365,25 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dword v0, v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4032,11 +4403,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 @@ -4044,9 +4416,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4060,22 +4433,25 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4089,23 +4465,27 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b64 v[0:1], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4119,22 +4499,25 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx2 v[0:1], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4154,12 +4537,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 @@ -4167,9 +4551,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4183,23 +4568,26 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4213,23 +4601,27 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4243,23 +4635,26 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4278,13 +4673,14 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-NEXT: v_mov_b32_e32 v2, 5 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i32@rel32@lo+4 @@ -4292,9 +4688,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4308,24 +4705,27 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4339,24 +4739,28 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 ; GFX11-NEXT: v_mov_b32_e32 v2, 5 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4370,24 +4774,27 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4406,14 +4813,15 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: v_mov_b32_e32 v1, 4 ; GFX9-NEXT: v_mov_b32_e32 v2, 5 ; GFX9-NEXT: v_mov_b32_e32 v3, 6 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32@rel32@lo+4 @@ -4421,9 +4829,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4437,25 +4846,28 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-NEXT: v_mov_b32_e32 v3, 6 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4469,24 +4881,28 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 4 ; GFX11-NEXT: v_dual_mov_b32 v2, 5 :: v_dual_mov_b32 v3, 6 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4500,25 +4916,28 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 6 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4537,11 +4956,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4 @@ -4549,9 +4969,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4565,22 +4986,25 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4594,23 +5018,27 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: global_load_b128 v[0:3], v[0:1], off -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4624,22 +5052,25 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v[0:1], off -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4659,14 +5090,15 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 ; GFX9-NEXT: v_mov_b32_e32 v1, 2 ; GFX9-NEXT: v_mov_b32_e32 v2, 3 ; GFX9-NEXT: v_mov_b32_e32 v3, 4 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4 @@ -4674,9 +5106,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4690,25 +5123,28 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4722,24 +5158,28 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4753,25 +5193,28 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4790,8 +5233,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -4799,6 +5242,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v2, 3 ; GFX9-NEXT: v_mov_b32_e32 v3, 4 ; GFX9-NEXT: v_mov_b32_e32 v4, 5 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v5i32@rel32@lo+4 @@ -4806,9 +5250,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4822,26 +5267,29 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 5 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4855,25 +5303,29 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, 5 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -4887,26 +5339,29 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 5 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -4925,8 +5380,9 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v8, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -4941,9 +5397,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -4957,27 +5414,30 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_dwordx4 v[0:3], v8, s[34:35] ; GFX10-NEXT: global_load_dwordx4 v[4:7], v8, s[34:35] offset:16 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v8i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -4991,28 +5451,32 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_b128 v[0:3], v4, s[0:1] ; GFX11-NEXT: global_load_b128 v[4:7], v4, s[0:1] offset:16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -5026,27 +5490,30 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v8, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v8, s[0:1] ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v8, s[0:1] offset:16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -5067,8 +5534,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 @@ -5079,6 +5546,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v5, 6 ; GFX9-NEXT: v_mov_b32_e32 v6, 7 ; GFX9-NEXT: v_mov_b32_e32 v7, 8 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v8i32@rel32@lo+4 @@ -5086,9 +5554,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -5102,19 +5571,20 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v4, 5 ; GFX10-NEXT: v_mov_b32_e32 v5, 6 ; GFX10-NEXT: v_mov_b32_e32 v6, 7 ; GFX10-NEXT: v_mov_b32_e32 v7, 8 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32@rel32@lo+4 @@ -5122,9 +5592,11 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -5138,15 +5610,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v0, 1 :: v_dual_mov_b32 v1, 2 ; GFX11-NEXT: v_dual_mov_b32 v2, 3 :: v_dual_mov_b32 v3, 4 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v4, 5 :: v_dual_mov_b32 v5, 6 ; GFX11-NEXT: v_dual_mov_b32 v6, 7 :: v_dual_mov_b32 v7, 8 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 @@ -5155,9 +5629,11 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -5171,19 +5647,20 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v3, 4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v4, 5 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v5, 6 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 7 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 8 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32@rel32@lo+4 @@ -5191,9 +5668,11 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -5212,8 +5691,9 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v16, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -5230,9 +5710,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -5246,11 +5727,13 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v16, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x3 @@ -5258,17 +5741,18 @@ ; GFX10-NEXT: global_load_dwordx4 v[4:7], v16, s[34:35] offset:16 ; GFX10-NEXT: global_load_dwordx4 v[8:11], v16, s[34:35] offset:32 ; GFX10-NEXT: global_load_dwordx4 v[12:15], v16, s[34:35] offset:48 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v16i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v16i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -5282,11 +5766,14 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v12, 0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x3 @@ -5294,18 +5781,19 @@ ; GFX11-NEXT: global_load_b128 v[4:7], v12, s[0:1] offset:16 ; GFX11-NEXT: global_load_b128 v[8:11], v12, s[0:1] offset:32 ; GFX11-NEXT: global_load_b128 v[12:15], v12, s[0:1] offset:48 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v16i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -5319,11 +5807,13 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v16, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x3 @@ -5331,17 +5821,18 @@ ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[4:7], v16, s[0:1] offset:16 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[8:11], v16, s[0:1] offset:32 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[12:15], v16, s[0:1] offset:48 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v16i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v16i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -5362,8 +5853,9 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -5385,9 +5877,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -5401,11 +5894,13 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v32, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x7 @@ -5417,17 +5912,18 @@ ; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[34:35] offset:80 ; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[34:35] offset:96 ; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[34:35] offset:112 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v32i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v32i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -5441,11 +5937,14 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v28, 0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x7 @@ -5457,18 +5956,19 @@ ; GFX11-NEXT: global_load_b128 v[20:23], v28, s[0:1] offset:80 ; GFX11-NEXT: global_load_b128 v[24:27], v28, s[0:1] offset:96 ; GFX11-NEXT: global_load_b128 v[28:31], v28, s[0:1] offset:112 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v32i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -5482,11 +5982,13 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x7 @@ -5498,17 +6000,18 @@ ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -5529,8 +6032,9 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v28, 0 ; GFX9-NEXT: global_load_dword v32, v[0:1], off @@ -5555,9 +6059,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -5571,11 +6076,13 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v32, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: global_load_dword v33, v[0:1], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -5588,19 +6095,20 @@ ; GFX10-NEXT: global_load_dwordx4 v[20:23], v32, s[34:35] offset:80 ; GFX10-NEXT: global_load_dwordx4 v[24:27], v32, s[34:35] offset:96 ; GFX10-NEXT: global_load_dwordx4 v[28:31], v32, s[34:35] offset:112 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v32i32_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v32i32_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(8) ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -5614,11 +6122,14 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v28, 0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: global_load_b32 v32, v[0:1], off ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -5631,19 +6142,20 @@ ; GFX11-NEXT: global_load_b128 v[20:23], v28, s[0:1] offset:80 ; GFX11-NEXT: global_load_b128 v[24:27], v28, s[0:1] offset:96 ; GFX11-NEXT: global_load_b128 v[28:31], v28, s[0:1] offset:112 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(8) ; GFX11-NEXT: scratch_store_b32 off, v32, s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -5657,11 +6169,13 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v32, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: global_load_dword v33, v[0:1], off ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) @@ -5674,19 +6188,20 @@ ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[20:23], v32, s[0:1] offset:80 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[24:27], v32, s[0:1] offset:96 ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[28:31], v32, s[0:1] offset:112 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v32i32_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v32i32_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(8) ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v33, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -5708,14 +6223,15 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v41, v0 ; GFX9-NEXT: v_mov_b32_e32 v0, 42 +; GFX9-NEXT: v_writelane_b32 v43, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: v_mov_b32_e32 v42, v1 ; GFX9-NEXT: s_getpc_b64 s[34:35] @@ -5728,11 +6244,12 @@ ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v43, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -5744,20 +6261,21 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v43, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v41, v0 ; GFX10-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v43, s34, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_mov_b32_e32 v42, v1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_i32_func_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_i32_func_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: global_store_dword v[41:42], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5766,12 +6284,14 @@ ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v43, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:12 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_addk_i32 s32, 0xfc00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -5782,20 +6302,23 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v43, s33 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: scratch_store_b32 off, v42, s33 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_dual_mov_b32 v42, v1 :: v_dual_mov_b32 v41, v0 ; GFX11-NEXT: v_mov_b32_e32 v0, 42 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: v_writelane_b32 v43, s0, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: global_store_b32 v[41:42], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5804,11 +6327,13 @@ ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v43, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v43, off, s33 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -5820,20 +6345,21 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v43, s33 offset:12 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v42, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v41, v0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 42 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v43, s0, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v42, v1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_i32_func_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_i32_func_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: global_store_dword v[41:42], v0, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 @@ -5842,12 +6368,14 @@ ; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v43, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 +; GFX10-SCRATCH-NEXT: scratch_load_dword v43, off, s33 offset:12 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -5864,8 +6392,9 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v2, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -5880,9 +6409,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -5896,27 +6426,30 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: global_load_ubyte v0, v2, s[34:35] ; GFX10-NEXT: global_load_dword v1, v2, s[34:35] offset:4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_struct_i8_i32@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -5930,28 +6463,32 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: global_load_u8 v0, v1, s[0:1] ; GFX11-NEXT: global_load_b32 v1, v1, s[0:1] offset:4 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_struct_i8_i32@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -5965,27 +6502,30 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: s_clause 0x1 ; GFX10-SCRATCH-NEXT: global_load_ubyte v0, v2, s[0:1] ; GFX10-SCRATCH-NEXT: global_load_dword v1, v2, s[0:1] offset:4 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_struct_i8_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_struct_i8_i32@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -6006,15 +6546,16 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v0, 3 -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32@rel32@lo+4 @@ -6022,11 +6563,12 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -6038,29 +6580,32 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: s_getpc_b64 s[34:35] -; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_byval_struct_i8_i32@rel32@hi+12 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_getpc_b64 s[34:35] +; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_byval_struct_i8_i32@rel32@hi+12 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_addk_i32 s32, 0xfc00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -6071,29 +6616,33 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8 -; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: s_getpc_b64 s[0:1] -; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 -; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_clause 0x1 ; GFX11-NEXT: scratch_store_b8 off, v0, s33 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 offset:4 ; GFX11-NEXT: v_mov_b32_e32 v0, s33 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_getpc_b64 s[0:1] +; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -6105,29 +6654,32 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:12 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] -; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 -; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s33 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v1, s33 offset:4 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, s33 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] +; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_byval_struct_i8_i32@rel32@lo+4 +; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_byval_struct_i8_i32@rel32@hi+12 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:12 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -6148,17 +6700,18 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: v_mov_b32_e32 v0, 3 ; GFX9-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_add_u32_e32 v0, 8, v0 ; GFX9-NEXT: v_lshrrev_b32_e64 v1, 6, s33 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 @@ -6168,7 +6721,7 @@ ; GFX9-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: global_store_byte v[0:1], v0, off ; GFX9-NEXT: s_waitcnt vmcnt(0) @@ -6176,6 +6729,7 @@ ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -6189,36 +6743,39 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-NEXT: v_mov_b32_e32 v1, 8 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: s_getpc_b64 s[34:35] -; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 -; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s33 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_lshrrev_b32_e64 v1, 5, s33 -; GFX10-NEXT: v_add_nc_u32_e32 v0, 8, v0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: s_getpc_b64 s[34:35] +; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 +; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 +; GFX10-NEXT: v_add_nc_u32_e32 v0, 8, v0 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_ubyte v0, off, s[0:3], s33 offset:8 ; GFX10-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_byte v[0:1], v0, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: global_store_dword v[0:1], v1, off ; GFX10-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:20 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 @@ -6232,11 +6789,13 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:20 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 3 :: v_dual_mov_b32 v1, 8 ; GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 @@ -6253,14 +6812,16 @@ ; GFX11-NEXT: scratch_load_b32 v1, off, s33 offset:12 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b8 v[0:1], v0, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: global_store_b32 v[0:1], v1, off dlc ; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:20 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -6274,12 +6835,13 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:16 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:20 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 3 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_sret_struct_i8_i32_byval_struct_i8_i32@rel32@hi+12 @@ -6296,14 +6858,16 @@ ; GFX10-SCRATCH-NEXT: scratch_load_dword v1, off, s33 offset:12 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: global_store_byte v[0:1], v0, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: global_store_dword v[0:1], v1, off ; GFX10-SCRATCH-NEXT: s_waitcnt_vscnt null, 0x0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:16 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:16 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:20 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 @@ -6335,8 +6899,9 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_mov_b32_e32 v0, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 @@ -6369,9 +6934,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -6385,13 +6951,14 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: global_load_dwordx4 v[0:3], v0, s[34:35] @@ -6421,9 +6988,11 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -6437,13 +7006,15 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_mov_b32_e32 v0, 0 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: global_load_b128 v[0:3], v0, s[0:1] @@ -6470,9 +7041,11 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -6486,13 +7059,14 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-SCRATCH-NEXT: global_load_dwordx4 v[0:3], v0, s[0:1] @@ -6522,9 +7096,11 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -6541,7 +7117,7 @@ ; GFX9-LABEL: tail_call_byval_align16: ; GFX9: ; %bb.0: ; %entry ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_mov_b32 s6, s33 +; GFX9-NEXT: s_mov_b32 s8, s33 ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill @@ -6626,14 +7202,14 @@ ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[4:5] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 -; GFX9-NEXT: s_mov_b32 s33, s6 +; GFX9-NEXT: s_mov_b32 s33, s8 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: tail_call_byval_align16: ; GFX10: ; %bb.0: ; %entry ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_mov_b32 s6, s33 +; GFX10-NEXT: s_mov_b32 s7, s33 ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s4, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill @@ -6721,14 +7297,14 @@ ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s4 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 -; GFX10-NEXT: s_mov_b32 s33, s6 +; GFX10-NEXT: s_mov_b32 s33, s7 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: tail_call_byval_align16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s4, s33 +; GFX11-NEXT: s_mov_b32 s5, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:24 ; 4-byte Folded Spill @@ -6811,14 +7387,14 @@ ; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:24 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 -; GFX11-NEXT: s_mov_b32 s33, s4 +; GFX11-NEXT: s_mov_b32 s33, s5 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-SCRATCH-LABEL: tail_call_byval_align16: ; GFX10-SCRATCH: ; %bb.0: ; %entry ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, s33 +; GFX10-SCRATCH-NEXT: s_mov_b32 s5, s33 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s0, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:24 ; 4-byte Folded Spill @@ -6903,7 +7479,7 @@ ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s0 ; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s4 +; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s5 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] entry: @@ -6921,11 +7497,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_mov_b32_e32 v0, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i1_inreg@rel32@lo+4 @@ -6934,9 +7511,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -6950,23 +7528,26 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i1_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i1_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-NEXT: buffer_store_byte v0, off, s[0:3], s32 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -6980,24 +7561,27 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_mov_b32_e32 v0, 1 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 -; GFX11-NEXT: scratch_store_b8 off, v0, s32 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: scratch_store_b8 off, v0, s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -7011,23 +7595,26 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i1_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i1_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 -; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 +; GFX10-SCRATCH-NEXT: scratch_store_byte off, v0, s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -7046,12 +7633,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7b +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i8_inreg@rel32@lo+4 @@ -7060,9 +7648,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -7076,24 +7665,27 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i8_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i8_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x7b ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -7107,15 +7699,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x7b ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i8_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_movk_i32 s4, 0x7b ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7123,9 +7717,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -7139,24 +7735,27 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i8_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i8_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -7175,12 +7774,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x7b +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i16_inreg@rel32@lo+4 @@ -7189,9 +7789,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -7205,24 +7806,27 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x7b ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -7236,15 +7840,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x7b ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i16_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_movk_i32 s4, 0x7b ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7252,9 +7858,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -7268,24 +7876,27 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -7304,12 +7915,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s4, 42 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i32_inreg@rel32@lo+4 @@ -7318,9 +7930,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -7334,24 +7947,27 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 42 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 42 ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -7365,15 +7981,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 42 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 42 ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -7381,9 +7999,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -7397,24 +8017,27 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 42 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -7433,14 +8056,15 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_movk_i32 s4, 0x7b ; GFX9-NEXT: s_mov_b32 s5, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_i64_inreg@rel32@lo+4 @@ -7450,9 +8074,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -7466,15 +8091,16 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x7b ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_i64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_i64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x7b ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 @@ -7484,9 +8110,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -7500,15 +8128,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x7b ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_i64_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_movk_i32 s4, 0x7b ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 @@ -7519,9 +8149,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -7535,15 +8167,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_i64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_i64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x7b ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 @@ -7553,9 +8186,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -7574,10 +8209,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 @@ -7595,9 +8231,10 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -7611,12 +8248,13 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 6 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -7633,9 +8271,11 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 6 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -7649,12 +8289,14 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 6 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -7672,9 +8314,11 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 6 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -7688,12 +8332,13 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -7710,9 +8355,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -7732,8 +8379,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -7744,6 +8391,7 @@ ; GFX9-NEXT: s_mov_b32 s5, 2 ; GFX9-NEXT: s_mov_b32 s6, 3 ; GFX9-NEXT: s_mov_b32 s7, 4 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i64_inreg@rel32@lo+4 @@ -7755,9 +8403,10 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -7771,15 +8420,16 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 6 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -7795,9 +8445,11 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 6 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -7811,15 +8463,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 6 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -7836,9 +8490,11 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 6 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -7852,15 +8508,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -7876,9 +8533,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -7897,10 +8556,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 8 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 @@ -7924,9 +8584,10 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 8 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -7940,12 +8601,13 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 8 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -7968,9 +8630,11 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 8 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -7984,12 +8648,14 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 8 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -8013,9 +8679,11 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 8 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -8029,12 +8697,13 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -8057,9 +8726,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -8081,11 +8752,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 10 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 ; GFX9-NEXT: s_mov_b64 s[34:35], 0 ; GFX9-NEXT: v_writelane_b32 v40, s8, 4 @@ -8114,9 +8786,10 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 10 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -8130,12 +8803,13 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 10 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_mov_b64 s[34:35], 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -8164,9 +8838,11 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 10 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -8180,12 +8856,14 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 10 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_mov_b64 s[0:1], 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -8215,9 +8893,11 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 10 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -8231,12 +8911,13 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_mov_b64 s[0:1], 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -8265,9 +8946,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -8288,12 +8971,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_movk_i32 s4, 0x4400 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_f16_inreg@rel32@lo+4 @@ -8302,9 +8986,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -8318,24 +9003,27 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_movk_i32 s4, 0x4400 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_movk_i32 s4, 0x4400 ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -8349,15 +9037,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_movk_i32 s4, 0x4400 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f16_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_movk_i32 s4, 0x4400 ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8365,9 +9055,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -8381,24 +9073,27 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_movk_i32 s4, 0x4400 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -8417,12 +9112,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: s_mov_b32 s4, 4.0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_f32_inreg@rel32@lo+4 @@ -8431,9 +9127,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -8447,24 +9144,27 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 4.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 4.0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -8478,15 +9178,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 4.0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 4.0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -8494,9 +9196,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -8510,24 +9214,27 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 4.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -8546,14 +9253,15 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2f32_inreg@rel32@lo+4 @@ -8563,9 +9271,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -8579,15 +9288,16 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1.0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 @@ -8597,9 +9307,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -8613,15 +9325,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1.0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1.0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 @@ -8632,9 +9346,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -8648,15 +9364,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 @@ -8666,9 +9383,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -8687,8 +9406,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 5 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -8697,6 +9416,7 @@ ; GFX9-NEXT: s_mov_b32 s4, 1.0 ; GFX9-NEXT: s_mov_b32 s5, 2.0 ; GFX9-NEXT: s_mov_b32 s6, 4.0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 4 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f32_inreg@rel32@lo+4 @@ -8707,9 +9427,10 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 5 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -8723,15 +9444,16 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 5 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1.0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -8744,9 +9466,11 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 5 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -8760,15 +9484,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 5 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1.0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1.0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -8782,9 +9508,11 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 5 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -8798,15 +9526,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -8819,9 +9548,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -8840,8 +9571,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 7 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -8854,6 +9585,7 @@ ; GFX9-NEXT: s_mov_b32 s6, 4.0 ; GFX9-NEXT: s_mov_b32 s7, -1.0 ; GFX9-NEXT: s_mov_b32 s8, 0.5 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 6 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v5f32_inreg@rel32@lo+4 @@ -8866,9 +9598,10 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 7 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -8882,15 +9615,16 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 7 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1.0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5f32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5f32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1.0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -8909,9 +9643,11 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 7 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -8925,15 +9661,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 7 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1.0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1.0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -8953,9 +9691,11 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 7 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -8969,15 +9709,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 7 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5f32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5f32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -8996,9 +9737,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -9017,14 +9760,15 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 0 ; GFX9-NEXT: s_mov_b32 s5, 0x40100000 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_f64_inreg@rel32@lo+4 @@ -9034,9 +9778,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -9050,15 +9795,16 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_f64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_f64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 0x40100000 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 @@ -9068,9 +9814,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -9084,15 +9832,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_f64_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 0x40100000 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 @@ -9103,9 +9853,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -9119,15 +9871,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_f64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40100000 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 @@ -9137,9 +9890,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -9158,8 +9913,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -9170,6 +9925,7 @@ ; GFX9-NEXT: s_mov_b32 s5, 2.0 ; GFX9-NEXT: s_mov_b32 s6, 0 ; GFX9-NEXT: s_mov_b32 s7, 0x40100000 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2f64_inreg@rel32@lo+4 @@ -9181,9 +9937,10 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -9197,15 +9954,16 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 6 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -9221,9 +9979,11 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 6 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -9237,15 +9997,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 6 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -9262,9 +10024,11 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 6 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -9278,15 +10042,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -9302,9 +10067,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -9323,8 +10090,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 8 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -9339,6 +10106,7 @@ ; GFX9-NEXT: s_mov_b32 s7, 0x40100000 ; GFX9-NEXT: s_mov_b32 s8, 0 ; GFX9-NEXT: s_mov_b32 s9, 0x40200000 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 7 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f64_inreg@rel32@lo+4 @@ -9352,9 +10120,10 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 8 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -9368,15 +10137,16 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 8 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f64_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f64_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2.0 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -9398,9 +10168,11 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 8 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -9414,15 +10186,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 8 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 2.0 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -9445,9 +10219,11 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 8 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -9461,15 +10237,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 8 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f64_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f64_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2.0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -9491,9 +10268,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 8 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -9512,12 +10291,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i16_inreg@rel32@lo+4 @@ -9526,9 +10306,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -9542,12 +10323,13 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i16_inreg@rel32@hi+12 @@ -9557,9 +10339,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -9573,12 +10357,14 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16_inreg@rel32@hi+12 @@ -9589,9 +10375,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -9605,12 +10393,13 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i16_inreg@rel32@hi+12 @@ -9620,9 +10409,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -9642,13 +10433,14 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4 @@ -9658,9 +10450,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -9674,11 +10467,12 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -9691,9 +10485,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -9707,11 +10503,13 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -9725,9 +10523,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -9741,11 +10541,12 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -9758,9 +10559,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -9780,13 +10583,14 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4 @@ -9796,9 +10600,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -9812,11 +10617,12 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -9829,9 +10635,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -9845,11 +10653,13 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -9863,9 +10673,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -9879,11 +10691,12 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -9896,9 +10709,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -9918,14 +10733,15 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 0x20001 ; GFX9-NEXT: s_mov_b32 s5, 3 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4 @@ -9935,9 +10751,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -9951,15 +10768,16 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x20001 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0x20001 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 3 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 @@ -9969,9 +10787,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -9985,15 +10805,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0x20001 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0x20001 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 3 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 @@ -10004,9 +10826,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -10020,15 +10844,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 @@ -10038,9 +10863,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -10059,14 +10886,15 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 0x40003c00 ; GFX9-NEXT: s_movk_i32 s5, 0x4400 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4 @@ -10076,9 +10904,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -10092,15 +10921,16 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x40003c00 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3f16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0x40003c00 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_movk_i32 s5, 0x4400 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 @@ -10110,9 +10940,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -10126,15 +10958,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0x40003c00 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0x40003c00 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_movk_i32 s5, 0x4400 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 @@ -10145,9 +10979,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -10161,15 +10997,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3f16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x40003c00 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_movk_i32 s5, 0x4400 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 @@ -10179,9 +11016,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -10200,13 +11039,14 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4 @@ -10216,9 +11056,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -10232,11 +11073,12 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -10249,9 +11091,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -10265,11 +11109,13 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -10283,9 +11129,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -10299,11 +11147,12 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -10316,9 +11165,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -10338,14 +11189,15 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 0x20001 ; GFX9-NEXT: s_mov_b32 s5, 0x40003 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4 @@ -10355,9 +11207,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -10371,15 +11224,16 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 0x20001 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i16_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 0x20001 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 0x40003 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 @@ -10389,9 +11243,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -10405,15 +11261,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 0x20001 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 0x20001 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 0x40003 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 @@ -10424,9 +11282,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -10440,15 +11300,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i16_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 0x20001 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 0x40003 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 @@ -10458,9 +11319,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -10479,12 +11342,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_load_dword s4, s[34:35], 0x0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2f16_inreg@rel32@lo+4 @@ -10493,9 +11357,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -10509,12 +11374,13 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_load_dword s4, s[34:35], 0x0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2f16_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2f16_inreg@rel32@hi+12 @@ -10524,9 +11390,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -10540,12 +11408,14 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_load_b32 s4, s[0:1], 0x0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16_inreg@rel32@hi+12 @@ -10556,9 +11426,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -10572,12 +11444,13 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 3 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: s_load_dword s4, s[0:1], 0x0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2f16_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2f16_inreg@rel32@hi+12 @@ -10587,9 +11460,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 3 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -10609,13 +11484,14 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4 @@ -10625,9 +11501,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -10641,11 +11518,12 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[34:35], 0x0 ; GFX10-NEXT: s_getpc_b64 s[34:35] @@ -10658,9 +11536,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -10674,11 +11554,13 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_load_b64 s[4:5], s[0:1], 0x0 ; GFX11-NEXT: s_getpc_b64 s[0:1] @@ -10692,9 +11574,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -10708,11 +11592,12 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[4:5], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] @@ -10725,9 +11610,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -10747,14 +11634,15 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 ; GFX9-NEXT: s_mov_b32 s4, 1 ; GFX9-NEXT: s_mov_b32 s5, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4 @@ -10764,9 +11652,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -10780,15 +11669,16 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v2i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s30, 2 @@ -10798,9 +11688,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -10814,15 +11706,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s30, 2 @@ -10833,9 +11727,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -10849,15 +11745,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 4 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v2i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v2i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 2 @@ -10867,9 +11764,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 4 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -10888,8 +11787,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 5 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -10898,6 +11797,7 @@ ; GFX9-NEXT: s_mov_b32 s4, 3 ; GFX9-NEXT: s_mov_b32 s5, 4 ; GFX9-NEXT: s_mov_b32 s6, 5 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 4 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_inreg@rel32@lo+4 @@ -10908,9 +11808,10 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 5 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -10924,15 +11825,16 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 5 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 3 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 3 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 4 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -10945,9 +11847,11 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 5 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -10961,15 +11865,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 5 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 3 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 3 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 4 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -10983,9 +11889,11 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 5 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -10999,15 +11907,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 5 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -11020,9 +11929,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 5 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -11041,8 +11952,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -11053,6 +11964,7 @@ ; GFX9-NEXT: s_mov_b32 s5, 4 ; GFX9-NEXT: s_mov_b32 s6, 5 ; GFX9-NEXT: s_mov_b32 s7, 6 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32_inreg@rel32@lo+4 @@ -11064,9 +11976,10 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -11080,15 +11993,16 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 6 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 3 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v3i32_i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v3i32_i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 3 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 4 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -11104,9 +12018,11 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 6 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -11120,15 +12036,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 6 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 3 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 3 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 4 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -11145,9 +12063,11 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 6 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -11161,15 +12081,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v3i32_i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v3i32_i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 3 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 4 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -11185,9 +12106,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -11206,8 +12129,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -11215,6 +12138,7 @@ ; GFX9-NEXT: s_load_dwordx4 s[4:7], s[34:35], 0x0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 4 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4 @@ -11226,9 +12150,10 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -11242,11 +12167,12 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 6 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -11263,9 +12189,11 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 6 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -11279,11 +12207,13 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 6 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -11301,9 +12231,11 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 6 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -11317,11 +12249,12 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -11338,9 +12271,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -11360,8 +12295,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 6 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -11372,6 +12307,7 @@ ; GFX9-NEXT: s_mov_b32 s5, 2 ; GFX9-NEXT: s_mov_b32 s6, 3 ; GFX9-NEXT: s_mov_b32 s7, 4 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 5 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4 @@ -11383,9 +12319,10 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 6 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -11399,15 +12336,16 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 6 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v4i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -11423,9 +12361,11 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 6 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -11439,15 +12379,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 6 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -11464,9 +12406,11 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 6 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -11480,15 +12424,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 6 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v4i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v4i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -11504,9 +12449,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 6 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -11525,8 +12472,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 7 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -11539,6 +12486,7 @@ ; GFX9-NEXT: s_mov_b32 s6, 3 ; GFX9-NEXT: s_mov_b32 s7, 4 ; GFX9-NEXT: s_mov_b32 s8, 5 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 6 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v5i32_inreg@rel32@lo+4 @@ -11551,9 +12499,10 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 7 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -11567,15 +12516,16 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 7 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v5i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v5i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -11594,9 +12544,11 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 7 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -11610,15 +12562,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 7 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -11638,9 +12592,11 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 7 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -11654,15 +12610,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 7 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v5i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v5i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -11681,9 +12638,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 7 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -11702,10 +12661,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 10 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s7, 3 @@ -11732,9 +12692,10 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 10 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -11748,12 +12709,13 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 10 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -11779,9 +12741,11 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 10 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -11795,12 +12759,14 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 10 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -11827,9 +12793,11 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 10 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -11843,12 +12811,13 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -11874,9 +12843,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -11897,8 +12868,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 10 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -11917,6 +12888,7 @@ ; GFX9-NEXT: s_mov_b32 s9, 6 ; GFX9-NEXT: s_mov_b32 s10, 7 ; GFX9-NEXT: s_mov_b32 s11, 8 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 9 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_v8i32_inreg@rel32@lo+4 @@ -11932,9 +12904,10 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 10 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -11948,15 +12921,16 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 10 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_v8i32_inreg@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, 1 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_mov_b32 s5, 2 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 @@ -11984,9 +12958,11 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 10 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -12000,15 +12976,17 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 10 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32_inreg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, 1 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_mov_b32 s5, 2 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 @@ -12037,9 +13015,11 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 10 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -12053,15 +13033,16 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 10 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_v8i32_inreg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, external_void_func_v8i32_inreg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-SCRATCH-NEXT: s_mov_b32 s4, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: s_mov_b32 s5, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 @@ -12089,9 +13070,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 10 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -12110,8 +13093,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 18 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -12122,6 +13105,7 @@ ; GFX9-NEXT: v_writelane_b32 v40, s11, 7 ; GFX9-NEXT: v_writelane_b32 v40, s12, 8 ; GFX9-NEXT: v_writelane_b32 v40, s13, 9 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s14, 10 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s15, 11 @@ -12156,9 +13140,10 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 18 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -12172,12 +13157,13 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 18 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -12219,9 +13205,11 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 18 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -12235,12 +13223,14 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 18 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -12283,9 +13273,11 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 18 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -12299,12 +13291,13 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 18 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -12346,9 +13339,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 18 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -12369,8 +13364,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 28 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -12386,6 +13381,7 @@ ; GFX9-NEXT: v_writelane_b32 v40, s16, 12 ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 ; GFX9-NEXT: v_writelane_b32 v40, s18, 14 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s20, 16 @@ -12459,9 +13455,10 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 28 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -12475,12 +13472,13 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 28 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -12567,9 +13565,11 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 28 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -12583,14 +13583,16 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 28 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_add_i32 s2, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -12671,9 +13673,11 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 28 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -12687,13 +13691,14 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 28 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 ; GFX10-SCRATCH-NEXT: s_add_i32 s2, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -12776,9 +13781,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 28 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -12799,8 +13806,8 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 28 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: v_writelane_b32 v40, s6, 2 @@ -12815,6 +13822,7 @@ ; GFX9-NEXT: v_writelane_b32 v40, s15, 11 ; GFX9-NEXT: v_writelane_b32 v40, s16, 12 ; GFX9-NEXT: v_writelane_b32 v40, s17, 13 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s18, 14 ; GFX9-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX9-NEXT: v_writelane_b32 v40, s19, 15 @@ -12894,9 +13902,10 @@ ; GFX9-NEXT: v_readlane_b32 s6, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 28 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -12910,12 +13919,13 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 28 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_load_dwordx2 s[34:35], s[34:35], 0x0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-NEXT: v_writelane_b32 v40, s7, 3 @@ -13007,9 +14017,11 @@ ; GFX10-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 28 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -13023,12 +14035,14 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 28 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: v_writelane_b32 v40, s6, 2 ; GFX11-NEXT: v_writelane_b32 v40, s7, 3 @@ -13114,9 +14128,11 @@ ; GFX11-NEXT: v_readlane_b32 s6, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 28 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -13130,12 +14146,13 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 28 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_load_dwordx2 s[0:1], s[0:1], 0x0 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s6, 2 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s7, 3 @@ -13225,9 +14242,11 @@ ; GFX10-SCRATCH-NEXT: v_readlane_b32 s6, v40, 2 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 28 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -13249,12 +14268,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 -; GFX9-NEXT: s_addk_i32 s32, 0x400 +; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, stack_passed_f64_arg@rel32@lo+4 @@ -13266,11 +14286,12 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: s_addk_i32 s32, 0xfc00 +; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s34 ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] @@ -13282,13 +14303,15 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_clause 0x1 ; GFX10-NEXT: buffer_load_dword v32, off, s[0:3], s33 ; GFX10-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:4 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 -; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x400 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, stack_passed_f64_arg@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, stack_passed_f64_arg@rel32@hi+12 @@ -13296,17 +14319,18 @@ ; GFX10-NEXT: buffer_store_dword v32, off, s[0:3], s32 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: buffer_store_dword v33, off, s[0:3], s32 offset:4 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:8 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: s_addk_i32 s32, 0xfe00 +; GFX10-NEXT: s_addk_i32 s32, 0xfc00 ; GFX10-NEXT: s_mov_b32 s33, s34 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] @@ -13317,26 +14341,30 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:8 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: scratch_load_b64 v[32:33], off, s33 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 -; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 32 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: scratch_store_b64 off, v[32:33], s32 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:8 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: s_add_i32 s32, s32, -16 +; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] @@ -13348,27 +14376,30 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 offset:8 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:12 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: scratch_load_dwordx2 v[32:33], off, s33 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 32 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, stack_passed_f64_arg@rel32@lo+4 ; GFX10-SCRATCH-NEXT: s_addc_u32 s1, s1, stack_passed_f64_arg@rel32@hi+12 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: scratch_store_dwordx2 off, v[32:33], s32 ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 offset:8 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:12 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 +; GFX10-SCRATCH-NEXT: s_addk_i32 s32, 0xffe0 ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s0 ; GFX10-SCRATCH-NEXT: s_waitcnt vmcnt(0) ; GFX10-SCRATCH-NEXT: s_setpc_b64 s[30:31] @@ -13385,6 +14416,7 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 12 @@ -13392,7 +14424,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; GFX9-NEXT: v_mov_b32_e32 v0, 14 -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:8 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -13429,6 +14460,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v29, 9 ; GFX9-NEXT: v_mov_b32_e32 v30, 10 ; GFX9-NEXT: v_mov_b32_e32 v31, 11 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_12xv3i32@rel32@lo+4 @@ -13436,9 +14468,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -13452,9 +14485,9 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 ; GFX10-NEXT: v_mov_b32_e32 v0, 12 ; GFX10-NEXT: v_mov_b32_e32 v1, 13 ; GFX10-NEXT: v_mov_b32_e32 v2, 14 @@ -13497,6 +14530,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v29, 9 ; GFX10-NEXT: v_mov_b32_e32 v30, 10 ; GFX10-NEXT: v_mov_b32_e32 v31, 11 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_12xv3i32@rel32@lo+4 @@ -13504,9 +14538,11 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -13520,9 +14556,10 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 12 :: v_dual_mov_b32 v1, 13 ; GFX11-NEXT: v_dual_mov_b32 v2, 14 :: v_dual_mov_b32 v3, 15 ; GFX11-NEXT: s_add_i32 s32, s32, 16 @@ -13544,6 +14581,7 @@ ; GFX11-NEXT: v_dual_mov_b32 v26, 8 :: v_dual_mov_b32 v27, 9 ; GFX11-NEXT: v_dual_mov_b32 v28, 9 :: v_dual_mov_b32 v29, 9 ; GFX11-NEXT: v_dual_mov_b32 v30, 10 :: v_dual_mov_b32 v31, 11 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_12xv3i32@rel32@lo+4 @@ -13552,9 +14590,11 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -13568,9 +14608,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 12 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 13 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 14 @@ -13610,6 +14650,7 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v29, 9 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v30, 10 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v31, 11 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-SCRATCH-NEXT: s_getpc_b64 s[0:1] ; GFX10-SCRATCH-NEXT: s_add_u32 s0, s0, external_void_func_12xv3i32@rel32@lo+4 @@ -13617,9 +14658,11 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -13651,6 +14694,7 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 8 @@ -13666,7 +14710,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 13 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX9-NEXT: v_mov_b32_e32 v0, 14 -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX9-NEXT: v_mov_b32_e32 v0, 15 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -13703,6 +14746,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v29, 5 ; GFX9-NEXT: v_mov_b32_e32 v30, 6 ; GFX9-NEXT: v_mov_b32_e32 v31, 7 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_8xv5i32@rel32@lo+4 @@ -13710,9 +14754,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -13726,20 +14771,20 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-NEXT: v_mov_b32_e32 v1, 9 ; GFX10-NEXT: v_mov_b32_e32 v2, 10 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_mov_b32_e32 v3, 14 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; GFX10-NEXT: v_mov_b32_e32 v0, 11 ; GFX10-NEXT: v_mov_b32_e32 v1, 12 ; GFX10-NEXT: v_mov_b32_e32 v2, 13 -; GFX10-NEXT: v_mov_b32_e32 v3, 14 ; GFX10-NEXT: v_mov_b32_e32 v4, 15 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 @@ -13779,6 +14824,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v29, 5 ; GFX10-NEXT: v_mov_b32_e32 v30, 6 ; GFX10-NEXT: v_mov_b32_e32 v31, 7 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_8xv5i32@rel32@lo+4 @@ -13786,9 +14832,11 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -13802,16 +14850,18 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_dual_mov_b32 v0, 8 :: v_dual_mov_b32 v1, 9 ; GFX11-NEXT: v_dual_mov_b32 v2, 10 :: v_dual_mov_b32 v3, 11 ; GFX11-NEXT: v_dual_mov_b32 v4, 12 :: v_dual_mov_b32 v5, 13 ; GFX11-NEXT: v_dual_mov_b32 v6, 14 :: v_dual_mov_b32 v7, 15 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0 ; GFX11-NEXT: v_dual_mov_b32 v0, 0 :: v_dual_mov_b32 v1, 0 @@ -13838,9 +14888,11 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -13854,9 +14906,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 8 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 9 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 10 @@ -13866,8 +14918,9 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 14 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 15 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s0, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 @@ -13909,9 +14962,11 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 @@ -13939,6 +14994,7 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41000000 @@ -13954,7 +15010,6 @@ ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41500000 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41600000 -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; GFX9-NEXT: v_mov_b32_e32 v0, 0x41700000 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 @@ -13991,6 +15046,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX9-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; GFX9-NEXT: v_mov_b32_e32 v31, 0x40e00000 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, external_void_func_8xv5f32@rel32@lo+4 @@ -13998,9 +15054,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -14014,20 +15071,20 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41100000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x41200000 ; GFX10-NEXT: s_addk_i32 s32, 0x200 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_mov_b32_e32 v3, 0x41600000 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX10-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX10-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; GFX10-NEXT: v_mov_b32_e32 v0, 0x41300000 ; GFX10-NEXT: v_mov_b32_e32 v1, 0x41400000 ; GFX10-NEXT: v_mov_b32_e32 v2, 0x41500000 -; GFX10-NEXT: v_mov_b32_e32 v3, 0x41600000 ; GFX10-NEXT: v_mov_b32_e32 v4, 0x41700000 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:12 @@ -14067,6 +15124,7 @@ ; GFX10-NEXT: v_mov_b32_e32 v29, 0x40a00000 ; GFX10-NEXT: v_mov_b32_e32 v30, 0x40c00000 ; GFX10-NEXT: v_mov_b32_e32 v31, 0x40e00000 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_8xv5f32@rel32@lo+4 @@ -14074,9 +15132,11 @@ ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -14090,9 +15150,10 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 ; GFX11-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX11-NEXT: v_mov_b32_e32 v1, 0x41100000 ; GFX11-NEXT: v_mov_b32_e32 v2, 0x41200000 @@ -14102,8 +15163,9 @@ ; GFX11-NEXT: v_mov_b32_e32 v6, 0x41600000 ; GFX11-NEXT: v_mov_b32_e32 v7, 0x41700000 ; GFX11-NEXT: s_add_i32 s32, s32, 16 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_add_i32 s0, s32, 16 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: scratch_store_b128 off, v[0:3], s32 ; GFX11-NEXT: scratch_store_b128 off, v[4:7], s0 ; GFX11-NEXT: v_mov_b32_e32 v6, 1.0 @@ -14132,9 +15194,11 @@ ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -14148,9 +15212,9 @@ ; GFX10-SCRATCH-NEXT: s_mov_b32 s33, s32 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 ; GFX10-SCRATCH-NEXT: scratch_store_dword off, v40, s33 ; 4-byte Folded Spill +; GFX10-SCRATCH-NEXT: scratch_store_dword off, v41, s33 offset:4 ; 4-byte Folded Spill ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s0, 2 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0x41000000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v1, 0x41100000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v2, 0x41200000 @@ -14160,8 +15224,9 @@ ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v6, 0x41600000 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v7, 0x41700000 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, 16 -; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v41, s0, 0 ; GFX10-SCRATCH-NEXT: s_add_i32 s0, s32, 16 +; GFX10-SCRATCH-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[0:3], s32 ; GFX10-SCRATCH-NEXT: scratch_store_dwordx4 off, v[4:7], s0 ; GFX10-SCRATCH-NEXT: v_mov_b32_e32 v0, 0 @@ -14203,9 +15268,11 @@ ; GFX10-SCRATCH-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX10-SCRATCH-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-SCRATCH-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v40, 2 +; GFX10-SCRATCH-NEXT: v_readlane_b32 s0, v41, 0 ; GFX10-SCRATCH-NEXT: s_or_saveexec_b32 s1, -1 -; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 ; 4-byte Folded Reload +; GFX10-SCRATCH-NEXT: s_clause 0x1 +; GFX10-SCRATCH-NEXT: scratch_load_dword v40, off, s33 +; GFX10-SCRATCH-NEXT: scratch_load_dword v41, off, s33 offset:4 ; GFX10-SCRATCH-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-SCRATCH-NEXT: s_mov_b32 exec_lo, s1 ; GFX10-SCRATCH-NEXT: s_add_i32 s32, s32, -16 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-preserved-registers.ll @@ -13,12 +13,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 4 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s5, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 2 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 3 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 @@ -31,9 +32,10 @@ ; GFX9-NEXT: v_readlane_b32 s30, v40, 2 ; GFX9-NEXT: v_readlane_b32 s5, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 4 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -47,11 +49,12 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 4 -; GFX10-NEXT: s_addk_i32 s32, 0x200 ; GFX10-NEXT: v_writelane_b32 v40, s4, 0 +; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: v_writelane_b32 v40, s5, 1 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 @@ -66,9 +69,11 @@ ; GFX10-NEXT: v_readlane_b32 s30, v40, 2 ; GFX10-NEXT: v_readlane_b32 s5, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 4 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -82,11 +87,13 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 4 -; GFX11-NEXT: s_add_i32 s32, s32, 16 ; GFX11-NEXT: v_writelane_b32 v40, s4, 0 +; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: v_writelane_b32 v40, s5, 1 ; GFX11-NEXT: s_getpc_b64 s[4:5] ; GFX11-NEXT: s_add_u32 s4, s4, external_void_func_void@rel32@lo+4 @@ -102,9 +109,11 @@ ; GFX11-NEXT: v_readlane_b32 s30, v40, 2 ; GFX11-NEXT: v_readlane_b32 s5, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 4 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -174,9 +183,9 @@ ; GFX11-LABEL: void_func_void_clobber_s28_s29: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v0, s28, 0 ; GFX11-NEXT: v_writelane_b32 v0, s29, 1 ; GFX11-NEXT: v_writelane_b32 v0, s30, 2 @@ -192,9 +201,9 @@ ; GFX11-NEXT: v_readlane_b32 s30, v0, 2 ; GFX11-NEXT: v_readlane_b32 s29, v0, 1 ; GFX11-NEXT: v_readlane_b32 s28, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber", "~{s[30:31]}"() #0 @@ -210,11 +219,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s31 @@ -231,9 +241,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -247,14 +258,15 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: ;;#ASMSTART @@ -269,9 +281,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -285,14 +299,16 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: ;;#ASMSTART @@ -307,9 +323,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -329,10 +347,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v42, s34, 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: ;;#ASMSTART @@ -350,9 +369,10 @@ ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v42, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -366,20 +386,21 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v42, s34, 0 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v31 ; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_mov_b32_e32 v41, v31 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_mov_b32_e32 v31, v41 ; GFX10-NEXT: ;;#ASMSTART @@ -388,9 +409,11 @@ ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v42, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -404,20 +427,23 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v42, s0, 0 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def v31 ; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: v_mov_b32_e32 v41, v31 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: v_mov_b32_e32 v31, v41 ; GFX11-NEXT: ;;#ASMSTART @@ -426,9 +452,11 @@ ; GFX11-NEXT: scratch_load_b32 v41, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v42, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -449,11 +477,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s33 @@ -470,9 +499,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -486,19 +516,20 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s33 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, s33 ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: s_mov_b32 s4, s33 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: s_mov_b32 s33, s4 @@ -508,9 +539,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -524,19 +557,21 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s33 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, s33 ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: s_mov_b32 s4, s33 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_mov_b32 s33, s4 @@ -547,9 +582,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -569,9 +606,10 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 ; GFX9-NEXT: ;;#ASMSTART @@ -590,9 +628,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -606,19 +645,20 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[36:37] ; GFX10-NEXT: s_add_u32 s36, s36, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s37, s37, external_void_func_void@rel32@hi+12 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s34 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, s34 ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: s_mov_b32 s4, s34 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[36:37] ; GFX10-NEXT: s_mov_b32 s34, s4 @@ -628,9 +668,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -644,19 +686,21 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s34 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, s34 ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: s_mov_b32 s4, s34 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_mov_b32 s34, s4 @@ -667,9 +711,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -689,10 +735,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v41, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v41, s30, 0 +; GFX9-NEXT: v_writelane_b32 v42, s34, 0 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v41, s31, 1 ; GFX9-NEXT: ;;#ASMSTART @@ -708,9 +755,10 @@ ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v41, 1 ; GFX9-NEXT: v_readlane_b32 s30, v41, 0 -; GFX9-NEXT: v_readlane_b32 s34, v41, 2 +; GFX9-NEXT: v_readlane_b32 s34, v42, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -724,19 +772,20 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v41, s34, 2 +; GFX10-NEXT: v_writelane_b32 v41, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v42, s34, 0 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v40 ; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v41, s30, 0 -; GFX10-NEXT: v_writelane_b32 v41, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; use v40 @@ -744,9 +793,11 @@ ; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GFX10-NEXT: v_readlane_b32 s31, v41, 1 ; GFX10-NEXT: v_readlane_b32 s30, v41, 0 -; GFX10-NEXT: v_readlane_b32 s34, v41, 2 +; GFX10-NEXT: v_readlane_b32 s34, v42, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -760,19 +811,22 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v41, s0, 2 +; GFX11-NEXT: v_writelane_b32 v41, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v42, s0, 0 ; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def v40 ; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_writelane_b32 v41, s31, 1 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v41, s30, 0 -; GFX11-NEXT: v_writelane_b32 v41, s31, 1 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; use v40 @@ -780,9 +834,11 @@ ; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: v_readlane_b32 s31, v41, 1 ; GFX11-NEXT: v_readlane_b32 s30, v41, 0 -; GFX11-NEXT: v_readlane_b32 s0, v41, 2 +; GFX11-NEXT: v_readlane_b32 s0, v42, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -798,54 +854,54 @@ ; GFX9-LABEL: void_func_void_clobber_s33: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: v_writelane_b32 v0, s33, 0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; clobber ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: v_readlane_b32 s33, v0, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: void_func_void_clobber_s33: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: v_writelane_b32 v0, s33, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; clobber ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: v_readlane_b32 s33, v0, 0 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_void_clobber_s33: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v0, s33, 0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; clobber ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s33, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber", "~{s33}"() #0 @@ -856,54 +912,54 @@ ; GFX9-LABEL: void_func_void_clobber_s34: ; GFX9: ; %bb.0: ; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: v_writelane_b32 v0, s34, 0 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; clobber ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: v_readlane_b32 s34, v0, 0 -; GFX9-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GFX9-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-NEXT: s_mov_b64 exec, s[4:5] +; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_setpc_b64 s[30:31] ; ; GFX10-LABEL: void_func_void_clobber_s34: ; GFX10: ; %bb.0: ; GFX10-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: v_writelane_b32 v0, s34, 0 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; clobber ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: v_readlane_b32 s34, v0, 0 -; GFX10-NEXT: s_xor_saveexec_b32 s4, -1 +; GFX10-NEXT: s_xor_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 -; GFX10-NEXT: s_mov_b32 exec_lo, s4 +; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: s_setpc_b64 s[30:31] ; ; GFX11-LABEL: void_func_void_clobber_s34: ; GFX11: ; %bb.0: ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_store_b32 off, v0, s32 ; 4-byte Folded Spill -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_writelane_b32 v0, s34, 0 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; clobber ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s34, v0, 0 -; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 +; GFX11-NEXT: s_xor_saveexec_b32 s1, -1 ; GFX11-NEXT: scratch_load_b32 v0, off, s32 ; 4-byte Folded Reload -; GFX11-NEXT: s_mov_b32 exec_lo, s0 +; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "; clobber", "~{s34}"() #0 @@ -918,10 +974,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s33@rel32@lo+4 @@ -929,9 +986,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -945,21 +1003,24 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s33@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, void_func_void_clobber_s33@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -973,22 +1034,26 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, void_func_void_clobber_s33@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, void_func_void_clobber_s33@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1006,10 +1071,11 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 2 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_getpc_b64 s[34:35] ; GFX9-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s34@rel32@lo+4 @@ -1017,9 +1083,10 @@ ; GFX9-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 2 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1033,21 +1100,24 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 2 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, void_func_void_clobber_s34@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, void_func_void_clobber_s34@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 2 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1061,22 +1131,26 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, void_func_void_clobber_s34@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, void_func_void_clobber_s34@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1094,11 +1168,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v41, s34, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: ;;#ASMSTART ; GFX9-NEXT: ; def s40 @@ -1114,9 +1189,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s34, v41, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1130,19 +1206,20 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v41, s34, 0 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, s40 ; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: s_mov_b32 s4, s40 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART @@ -1151,9 +1228,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s34, v41, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1167,19 +1246,21 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 +; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v41, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s40 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, s40 ; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: s_mov_b32 s4, s40 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: ;;#ASMSTART @@ -1189,9 +1270,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s0, v41, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 +; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:4 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -1211,11 +1294,12 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[36:37] -; GFX9-NEXT: v_writelane_b32 v40, s34, 3 ; GFX9-NEXT: v_writelane_b32 v40, s4, 0 ; GFX9-NEXT: s_addk_i32 s32, 0x400 ; GFX9-NEXT: v_writelane_b32 v40, s30, 1 +; GFX9-NEXT: v_writelane_b32 v42, s34, 0 ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX9-NEXT: v_writelane_b32 v40, s31, 2 ; GFX9-NEXT: ;;#ASMSTART @@ -1240,9 +1324,10 @@ ; GFX9-NEXT: v_readlane_b32 s31, v40, 2 ; GFX9-NEXT: v_readlane_b32 s30, v40, 1 ; GFX9-NEXT: v_readlane_b32 s4, v40, 0 -; GFX9-NEXT: v_readlane_b32 s34, v40, 3 +; GFX9-NEXT: v_readlane_b32 s34, v42, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[36:37], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-NEXT: s_addk_i32 s32, 0xfc00 ; GFX9-NEXT: s_mov_b32 s33, s34 @@ -1256,24 +1341,25 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 -; GFX10-NEXT: v_writelane_b32 v40, s34, 3 +; GFX10-NEXT: v_writelane_b32 v40, s4, 0 ; GFX10-NEXT: s_addk_i32 s32, 0x200 +; GFX10-NEXT: v_writelane_b32 v42, s34, 0 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def s40 ; GFX10-NEXT: ;;#ASMEND +; GFX10-NEXT: v_writelane_b32 v40, s30, 1 +; GFX10-NEXT: s_mov_b32 s4, s40 ; GFX10-NEXT: ;;#ASMSTART ; GFX10-NEXT: ; def v32 ; GFX10-NEXT: ;;#ASMEND -; GFX10-NEXT: v_writelane_b32 v40, s4, 0 -; GFX10-NEXT: s_mov_b32 s4, s40 ; GFX10-NEXT: v_mov_b32_e32 v41, v32 ; GFX10-NEXT: s_getpc_b64 s[34:35] ; GFX10-NEXT: s_add_u32 s34, s34, external_void_func_void@rel32@lo+4 ; GFX10-NEXT: s_addc_u32 s35, s35, external_void_func_void@rel32@hi+12 -; GFX10-NEXT: v_writelane_b32 v40, s30, 1 ; GFX10-NEXT: v_writelane_b32 v40, s31, 2 ; GFX10-NEXT: s_swappc_b64 s[30:31], s[34:35] ; GFX10-NEXT: ;;#ASMSTART @@ -1286,9 +1372,11 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 2 ; GFX10-NEXT: v_readlane_b32 s30, v40, 1 ; GFX10-NEXT: v_readlane_b32 s4, v40, 0 -; GFX10-NEXT: v_readlane_b32 s34, v40, 3 +; GFX10-NEXT: v_readlane_b32 s34, v42, 0 ; GFX10-NEXT: s_or_saveexec_b32 s35, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 +; GFX10-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s35 ; GFX10-NEXT: s_addk_i32 s32, 0xfe00 @@ -1302,24 +1390,26 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:4 +; GFX11-NEXT: scratch_store_b32 off, v42, s33 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 -; GFX11-NEXT: v_writelane_b32 v40, s0, 3 +; GFX11-NEXT: v_writelane_b32 v40, s4, 0 ; GFX11-NEXT: s_add_i32 s32, s32, 16 +; GFX11-NEXT: v_writelane_b32 v42, s0, 0 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 ; 4-byte Folded Spill ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def s40 ; GFX11-NEXT: ;;#ASMEND +; GFX11-NEXT: v_writelane_b32 v40, s30, 1 +; GFX11-NEXT: s_mov_b32 s4, s40 ; GFX11-NEXT: ;;#ASMSTART ; GFX11-NEXT: ; def v32 ; GFX11-NEXT: ;;#ASMEND -; GFX11-NEXT: v_writelane_b32 v40, s4, 0 -; GFX11-NEXT: s_mov_b32 s4, s40 ; GFX11-NEXT: v_mov_b32_e32 v41, v32 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, external_void_func_void@rel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, external_void_func_void@rel32@hi+12 -; GFX11-NEXT: v_writelane_b32 v40, s30, 1 ; GFX11-NEXT: v_writelane_b32 v40, s31, 2 ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: ;;#ASMSTART @@ -1332,9 +1422,11 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 2 ; GFX11-NEXT: v_readlane_b32 s30, v40, 1 ; GFX11-NEXT: v_readlane_b32 s4, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 3 +; GFX11-NEXT: v_readlane_b32 s0, v42, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:4 +; GFX11-NEXT: scratch_load_b32 v42, off, s33 offset:8 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_add_i32 s32, s32, -16 ; GFX11-NEXT: s_mov_b32 s33, s0 diff --git a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll --- a/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll +++ b/llvm/test/CodeGen/AMDGPU/gfx-callable-return-types.ll @@ -79,7 +79,7 @@ ; GFX11-LABEL: call_i1: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill @@ -100,7 +100,7 @@ ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -184,7 +184,7 @@ ; GFX11-LABEL: call_i16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill @@ -205,7 +205,7 @@ ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -289,7 +289,7 @@ ; GFX11-LABEL: call_2xi16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v1, s33 ; 4-byte Folded Spill @@ -310,7 +310,7 @@ ; GFX11-NEXT: scratch_load_b32 v1, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -402,7 +402,7 @@ ; GFX11-LABEL: call_3xi16: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v2, s33 ; 4-byte Folded Spill @@ -423,7 +423,7 @@ ; GFX11-NEXT: scratch_load_b32 v2, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -890,7 +890,7 @@ ; GFX11-LABEL: call_100xi32: ; GFX11: ; %bb.0: ; %entry ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v100, s33 offset:128 ; 4-byte Folded Spill @@ -976,7 +976,7 @@ ; GFX11-NEXT: scratch_load_b32 v100, off, s33 offset:128 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_addk_i32 s32, 0xff70 -; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] entry: @@ -2835,7 +2835,7 @@ ; GFX9-NEXT: s_add_i32 s33, s32, 0x7fc0 ; GFX9-NEXT: s_and_b32 s33, s33, 0xffff8000 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_add_i32 s32, s32, 0x28000 ; GFX9-NEXT: s_getpc_b64 s[34:35] @@ -3001,28 +3001,28 @@ ; GFX9-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:600 ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:604 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:608 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:612 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:616 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:620 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:624 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:628 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:632 ; GFX9-NEXT: s_waitcnt vmcnt(0) -; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; GFX9-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; GFX9-NEXT: buffer_store_dword v32, off, s[0:3], s32 offset:8 @@ -3072,14 +3072,14 @@ ; GFX9-NEXT: v_mov_b32_e32 v6, v28 ; GFX9-NEXT: v_mov_b32_e32 v7, v29 ; GFX9-NEXT: v_mov_b32_e32 v8, v30 -; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:1568 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:1564 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:1560 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:1556 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:1552 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:1548 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:1544 ; 4-byte Folded Reload -; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:1540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:1564 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:1560 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:1556 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:1552 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:1548 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:1544 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:1540 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload ; GFX9-NEXT: v_lshrrev_b32_e64 v0, 6, s33 ; GFX9-NEXT: v_add_u32_e32 v0, 0x400, v0 ; GFX9-NEXT: v_mov_b32_e32 v1, 42 @@ -3103,7 +3103,7 @@ ; GFX9-NEXT: v_readlane_b32 s31, v33, 1 ; GFX9-NEXT: v_readlane_b32 s30, v33, 0 ; GFX9-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:1568 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-NEXT: s_add_i32 s32, s32, 0xfffd8000 ; GFX9-NEXT: s_mov_b32 s33, s36 @@ -3117,7 +3117,7 @@ ; GFX10-NEXT: s_add_i32 s33, s32, 0x3fe0 ; GFX10-NEXT: s_and_b32 s33, s33, 0xffffc000 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_add_i32 s32, s32, 0x14000 @@ -3262,28 +3262,28 @@ ; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:796 ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:516 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1536 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:520 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1540 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:524 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1544 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:528 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1548 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:532 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1552 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:536 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1556 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:540 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1560 ; 4-byte Folded Spill ; GFX10-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:544 ; GFX10-NEXT: s_waitcnt vmcnt(0) -; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1568 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:1564 ; 4-byte Folded Spill ; GFX10-NEXT: s_clause 0x15 ; GFX10-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:548 ; GFX10-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:552 @@ -3350,14 +3350,14 @@ ; GFX10-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:156 ; GFX10-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:160 ; GFX10-NEXT: s_clause 0x7 -; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1540 -; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1544 -; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1548 -; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:1552 -; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:1556 -; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:1560 -; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:1564 -; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:1568 +; GFX10-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:1536 +; GFX10-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:1540 +; GFX10-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:1544 +; GFX10-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:1548 +; GFX10-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:1552 +; GFX10-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:1556 +; GFX10-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:1560 +; GFX10-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:1564 ; GFX10-NEXT: v_lshrrev_b32_e64 v0, 5, s33 ; GFX10-NEXT: v_mov_b32_e32 v1, 42 ; GFX10-NEXT: v_add_nc_u32_e32 v0, 0x400, v0 @@ -3381,7 +3381,7 @@ ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 ; GFX10-NEXT: s_or_saveexec_b32 s34, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:1536 ; 4-byte Folded Reload +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:1568 ; 4-byte Folded Reload ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s34 ; GFX10-NEXT: s_add_i32 s32, s32, 0xfffec000 @@ -3397,7 +3397,7 @@ ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_and_b32 s33, s33, 0xfffffe00 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:1536 ; 4-byte Folded Spill +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:1600 ; 4-byte Folded Spill ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_mov_b32 s0, 0 ; GFX11-NEXT: v_mov_b32_e32 v4, 0 @@ -3495,7 +3495,7 @@ ; GFX11-NEXT: s_waitcnt vmcnt(4) ; GFX11-NEXT: v_dual_mov_b32 v44, v62 :: v_dual_mov_b32 v57, v16 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[12:15], s33 offset:1588 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[12:15], s33 offset:1584 ; 16-byte Folded Spill ; GFX11-NEXT: s_clause 0x3 ; GFX11-NEXT: scratch_load_b128 v[12:15], off, s33 offset:528 ; GFX11-NEXT: scratch_load_b128 v[20:23], off, s33 offset:544 @@ -3511,13 +3511,13 @@ ; GFX11-NEXT: s_waitcnt vmcnt(1) ; GFX11-NEXT: v_dual_mov_b32 v10, v21 :: v_dual_mov_b32 v15, v26 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1572 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1568 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:592 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1556 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1552 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_load_b128 v[28:31], off, s33 offset:608 ; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1540 ; 16-byte Folded Spill +; GFX11-NEXT: scratch_store_b128 off, v[28:31], s33 offset:1536 ; 16-byte Folded Spill ; GFX11-NEXT: scratch_store_b128 off, v[32:35], s32 ; GFX11-NEXT: v_mov_b32_e32 v32, v36 ; GFX11-NEXT: v_dual_mov_b32 v33, v48 :: v_dual_mov_b32 v34, v49 @@ -3560,13 +3560,13 @@ ; GFX11-NEXT: scratch_store_b128 off, v[36:39], s0 ; GFX11-NEXT: s_add_i32 s0, s32, 16 ; GFX11-NEXT: scratch_store_b128 off, v[32:35], s0 -; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1588 ; 16-byte Folded Reload +; GFX11-NEXT: scratch_load_b128 v[1:4], off, s33 offset:1584 ; 16-byte Folded Reload ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: v_mov_b32_e32 v1, 42 ; GFX11-NEXT: s_clause 0x2 -; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1572 -; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1556 -; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1540 +; GFX11-NEXT: scratch_load_b128 v[17:20], off, s33 offset:1568 +; GFX11-NEXT: scratch_load_b128 v[21:24], off, s33 offset:1552 +; GFX11-NEXT: scratch_load_b128 v[25:28], off, s33 offset:1536 ; GFX11-NEXT: s_add_i32 s0, s33, 0x400 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: v_mov_b32_e32 v0, s0 @@ -3590,7 +3590,7 @@ ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 ; GFX11-NEXT: s_or_saveexec_b32 s0, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:1536 ; 4-byte Folded Reload +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:1600 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_addk_i32 s32, 0xf600 ; GFX11-NEXT: s_mov_b32 s33, s45 diff --git a/llvm/test/CodeGen/AMDGPU/indirect-call.ll b/llvm/test/CodeGen/AMDGPU/indirect-call.ll --- a/llvm/test/CodeGen/AMDGPU/indirect-call.ll +++ b/llvm/test/CodeGen/AMDGPU/indirect-call.ll @@ -397,8 +397,9 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v40, s16, 18 +; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -465,9 +466,10 @@ ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 s4, v40, 18 +; GCN-NEXT: v_readlane_b32 s4, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 @@ -481,8 +483,9 @@ ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[18:19] -; GISEL-NEXT: v_writelane_b32 v40, s16, 18 +; GISEL-NEXT: v_writelane_b32 v41, s16, 0 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -549,9 +552,10 @@ ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: v_readlane_b32 s4, v40, 18 +; GISEL-NEXT: v_readlane_b32 s4, v41, 0 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s4 @@ -569,8 +573,9 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v40, s16, 18 +; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -640,9 +645,10 @@ ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 s4, v40, 18 +; GCN-NEXT: v_readlane_b32 s4, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 @@ -656,8 +662,9 @@ ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[18:19] -; GISEL-NEXT: v_writelane_b32 v40, s16, 18 +; GISEL-NEXT: v_writelane_b32 v41, s16, 0 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -725,9 +732,10 @@ ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: v_readlane_b32 s4, v40, 18 +; GISEL-NEXT: v_readlane_b32 s4, v41, 0 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s4 @@ -745,8 +753,9 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v40, s16, 18 +; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -815,9 +824,10 @@ ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 s4, v40, 18 +; GCN-NEXT: v_readlane_b32 s4, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 @@ -831,8 +841,9 @@ ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[18:19] -; GISEL-NEXT: v_writelane_b32 v40, s16, 18 +; GISEL-NEXT: v_writelane_b32 v41, s16, 0 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -901,9 +912,10 @@ ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: v_readlane_b32 s4, v40, 18 +; GISEL-NEXT: v_readlane_b32 s4, v41, 0 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s4 @@ -922,8 +934,9 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v40, s16, 20 +; GCN-NEXT: v_writelane_b32 v41, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: v_writelane_b32 v40, s30, 0 ; GCN-NEXT: v_writelane_b32 v40, s31, 1 @@ -1001,9 +1014,10 @@ ; GCN-NEXT: v_readlane_b32 s34, v40, 2 ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 s4, v40, 20 +; GCN-NEXT: v_readlane_b32 s4, v41, 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 @@ -1017,8 +1031,9 @@ ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GISEL-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GISEL-NEXT: s_mov_b64 exec, s[18:19] -; GISEL-NEXT: v_writelane_b32 v40, s16, 20 +; GISEL-NEXT: v_writelane_b32 v41, s16, 0 ; GISEL-NEXT: s_addk_i32 s32, 0x400 ; GISEL-NEXT: v_writelane_b32 v40, s30, 0 ; GISEL-NEXT: v_writelane_b32 v40, s31, 1 @@ -1096,9 +1111,10 @@ ; GISEL-NEXT: v_readlane_b32 s34, v40, 2 ; GISEL-NEXT: v_readlane_b32 s31, v40, 1 ; GISEL-NEXT: v_readlane_b32 s30, v40, 0 -; GISEL-NEXT: v_readlane_b32 s4, v40, 20 +; GISEL-NEXT: v_readlane_b32 s4, v41, 0 ; GISEL-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GISEL-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[6:7] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 ; GISEL-NEXT: s_mov_b32 s33, s4 @@ -1311,7 +1327,7 @@ ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s12, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -1402,14 +1418,14 @@ ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s12 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_reuse: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s10, s33 +; GISEL-NEXT: s_mov_b32 s12, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -1500,7 +1516,7 @@ ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s10 +; GISEL-NEXT: s_mov_b32 s33, s12 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] call amdgpu_gfx void %fptr(i32 %i) @@ -1515,7 +1531,7 @@ ; GCN-LABEL: test_indirect_call_vgpr_ptr_arg_and_return: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s12, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1604,14 +1620,14 @@ ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s12 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_call_vgpr_ptr_arg_and_return: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s10, s33 +; GISEL-NEXT: s_mov_b32 s12, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1700,7 +1716,7 @@ ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s10 +; GISEL-NEXT: s_mov_b32 s33, s12 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] %ret = call amdgpu_gfx i32 %fptr(i32 %i) @@ -1712,7 +1728,7 @@ ; GCN-LABEL: test_indirect_tail_call_vgpr_ptr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s10, s33 +; GCN-NEXT: s_mov_b32 s12, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1798,14 +1814,14 @@ ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 -; GCN-NEXT: s_mov_b32 s33, s10 +; GCN-NEXT: s_mov_b32 s33, s12 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] ; ; GISEL-LABEL: test_indirect_tail_call_vgpr_ptr: ; GISEL: ; %bb.0: ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GISEL-NEXT: s_mov_b32 s10, s33 +; GISEL-NEXT: s_mov_b32 s12, s33 ; GISEL-NEXT: s_mov_b32 s33, s32 ; GISEL-NEXT: s_or_saveexec_b64 s[4:5], -1 ; GISEL-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill @@ -1891,7 +1907,7 @@ ; GISEL-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload ; GISEL-NEXT: s_mov_b64 exec, s[4:5] ; GISEL-NEXT: s_addk_i32 s32, 0xfc00 -; GISEL-NEXT: s_mov_b32 s33, s10 +; GISEL-NEXT: s_mov_b32 s33, s12 ; GISEL-NEXT: s_waitcnt vmcnt(0) ; GISEL-NEXT: s_setpc_b64 s[30:31] tail call amdgpu_gfx void %fptr() diff --git a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll --- a/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll +++ b/llvm/test/CodeGen/AMDGPU/insert-delay-alu-bug.ll @@ -7,7 +7,7 @@ ; GFX11-LABEL: f0: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s2, s33 +; GFX11-NEXT: s_mov_b32 s3, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_xor_saveexec_b32 s0, -1 ; GFX11-NEXT: scratch_store_b32 off, v4, s33 ; 4-byte Folded Spill @@ -28,7 +28,7 @@ ; GFX11-NEXT: scratch_load_b32 v4, off, s33 ; 4-byte Folded Reload ; GFX11-NEXT: s_mov_b32 exec_lo, s0 ; GFX11-NEXT: s_add_i32 s32, s32, -16 -; GFX11-NEXT: s_mov_b32 s33, s2 +; GFX11-NEXT: s_mov_b32 s33, s3 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: s_setpc_b64 s[30:31] bb: @@ -53,20 +53,20 @@ ; GFX11-LABEL: f2: ; GFX11: ; %bb.0: ; %bb ; GFX11-NEXT: s_mov_b64 s[16:17], s[4:5] +; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] +; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3] +; GFX11-NEXT: s_load_b32 s2, s[16:17], 0x24 ; GFX11-NEXT: v_mov_b32_e32 v31, v0 -; GFX11-NEXT: s_load_b32 s24, s[16:17], 0x24 ; GFX11-NEXT: s_mov_b32 s18, s14 ; GFX11-NEXT: s_mov_b32 s12, s13 -; GFX11-NEXT: s_mov_b64 s[10:11], s[6:7] -; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v31 -; GFX11-NEXT: s_mov_b64 s[6:7], s[2:3] ; GFX11-NEXT: s_mov_b64 s[4:5], s[0:1] -; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_mov_b32 s20, 0 +; GFX11-NEXT: v_and_b32_e32 v0, 0x3ff, v31 ; GFX11-NEXT: s_mov_b32 s0, -1 ; GFX11-NEXT: s_mov_b32 s19, exec_lo ; GFX11-NEXT: s_mov_b32 s32, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_mul_lo_u32 v0, s24, v0 +; GFX11-NEXT: v_mul_lo_u32 v0, s2, v0 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) ; GFX11-NEXT: v_cmpx_eq_u32_e32 0, v0 ; GFX11-NEXT: s_cbranch_execz .LBB2_13 @@ -74,7 +74,7 @@ ; GFX11-NEXT: s_load_b128 s[20:23], s[16:17], 0x2c ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_bitcmp1_b32 s21, 0 -; GFX11-NEXT: s_cselect_b32 s25, -1, 0 +; GFX11-NEXT: s_cselect_b32 s24, -1, 0 ; GFX11-NEXT: s_bitcmp0_b32 s21, 0 ; GFX11-NEXT: s_mov_b32 s21, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_3 @@ -90,41 +90,40 @@ ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] ; GFX11-NEXT: s_mov_b32 s1, -1 -; GFX11-NEXT: s_and_not1_b32 vcc_lo, exec_lo, s3 -; GFX11-NEXT: s_cbranch_vccz .LBB2_4 +; GFX11-NEXT: s_cbranch_execz .LBB2_4 ; GFX11-NEXT: s_branch .LBB2_12 ; GFX11-NEXT: .LBB2_3: ; GFX11-NEXT: s_mov_b32 s1, 0 ; GFX11-NEXT: .LBB2_4: ; %bb16 -; GFX11-NEXT: s_load_b32 s2, s[16:17], 0x54 +; GFX11-NEXT: s_load_b32 s3, s[16:17], 0x54 ; GFX11-NEXT: s_bitcmp1_b32 s23, 0 ; GFX11-NEXT: s_cselect_b32 s0, -1, 0 -; GFX11-NEXT: s_and_b32 s3, s23, 1 +; GFX11-NEXT: s_and_b32 s9, s23, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_bitcmp1_b32 s2, 0 -; GFX11-NEXT: s_mov_b32 s2, -1 +; GFX11-NEXT: s_bitcmp1_b32 s3, 0 +; GFX11-NEXT: s_mov_b32 s3, -1 ; GFX11-NEXT: s_cselect_b32 s8, -1, 0 -; GFX11-NEXT: s_cmp_eq_u32 s3, 0 +; GFX11-NEXT: s_cmp_eq_u32 s9, 0 ; GFX11-NEXT: s_cbranch_scc0 .LBB2_8 ; GFX11-NEXT: ; %bb.5: ; %bb18.preheader ; GFX11-NEXT: s_load_b128 s[28:31], s[16:17], 0x44 ; GFX11-NEXT: v_mov_b32_e32 v1, 0 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mul_hi_u32 s2, s29, s28 -; GFX11-NEXT: s_mul_i32 s3, s29, s28 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_1) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_alignbit_b32 v0, s2, s3, 1 -; GFX11-NEXT: s_mov_b32 s3, 0 -; GFX11-NEXT: v_readfirstlane_b32 s2, v0 -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s25 +; GFX11-NEXT: s_mul_hi_u32 s3, s29, s28 +; GFX11-NEXT: s_mul_i32 s9, s29, s28 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-NEXT: v_alignbit_b32 v0, s3, s9, 1 +; GFX11-NEXT: v_readfirstlane_b32 s3, v0 +; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s24 ; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s2, s2, 1 -; GFX11-NEXT: s_lshr_b32 s2, s2, s30 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_mul_i32 s2, s2, s22 -; GFX11-NEXT: s_mul_i32 s2, s2, s20 -; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_or_b32 s2, s24, s2 +; GFX11-NEXT: s_or_b32 s3, s3, 1 +; GFX11-NEXT: s_lshr_b32 s3, s3, s30 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) | instskip(SKIP_2) | instid1(SALU_CYCLE_1) +; GFX11-NEXT: s_mul_i32 s9, s3, s22 +; GFX11-NEXT: s_mov_b32 s3, 0 +; GFX11-NEXT: s_mul_i32 s9, s9, s20 +; GFX11-NEXT: s_or_b32 s2, s2, s9 +; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) ; GFX11-NEXT: s_lshl_b64 s[22:23], s[2:3], 1 ; GFX11-NEXT: global_load_u16 v2, v1, s[22:23] ; GFX11-NEXT: s_waitcnt vmcnt(0) @@ -152,10 +151,10 @@ ; GFX11-NEXT: s_or_b32 s3, s2, s3 ; GFX11-NEXT: s_cbranch_vccz .LBB2_6 ; GFX11-NEXT: ; %bb.7: ; %Flow -; GFX11-NEXT: s_mov_b32 s2, 0 +; GFX11-NEXT: s_mov_b32 s3, 0 ; GFX11-NEXT: .LBB2_8: ; %Flow12 ; GFX11-NEXT: s_delay_alu instid0(SALU_CYCLE_1) -; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s2 +; GFX11-NEXT: s_and_b32 vcc_lo, exec_lo, s3 ; GFX11-NEXT: s_cbranch_vccz .LBB2_12 ; GFX11-NEXT: ; %bb.9: ; GFX11-NEXT: s_xor_b32 s0, s8, -1 @@ -167,11 +166,11 @@ ; GFX11-NEXT: ; %bb.11: ; %Flow6 ; GFX11-NEXT: s_mov_b32 s21, -1 ; GFX11-NEXT: .LBB2_12: ; %Flow11 -; GFX11-NEXT: s_and_b32 s3, s1, exec_lo +; GFX11-NEXT: s_and_b32 s20, s1, exec_lo ; GFX11-NEXT: s_or_not1_b32 s0, s21, exec_lo ; GFX11-NEXT: .LBB2_13: ; %Flow9 ; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s19 -; GFX11-NEXT: s_and_saveexec_b32 s19, s0 +; GFX11-NEXT: s_and_saveexec_b32 s2, s0 ; GFX11-NEXT: s_cbranch_execz .LBB2_15 ; GFX11-NEXT: ; %bb.14: ; %bb43 ; GFX11-NEXT: s_add_u32 s8, s16, 0x58 @@ -184,10 +183,10 @@ ; GFX11-NEXT: s_mov_b32 s14, s15 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] -; GFX11-NEXT: s_or_b32 s3, s3, exec_lo +; GFX11-NEXT: s_or_b32 s20, s20, exec_lo ; GFX11-NEXT: .LBB2_15: ; %Flow14 -; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s19 -; GFX11-NEXT: s_and_saveexec_b32 s0, s3 +; GFX11-NEXT: s_or_b32 exec_lo, exec_lo, s2 +; GFX11-NEXT: s_and_saveexec_b32 s0, s20 ; GFX11-NEXT: ; %bb.16: ; %UnifiedUnreachableBlock ; GFX11-NEXT: ; divergent unreachable ; GFX11-NEXT: ; %bb.17: ; %UnifiedReturnBlock diff --git a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll --- a/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll +++ b/llvm/test/CodeGen/AMDGPU/kernel-vgpr-spill-mubuf-with-voffset.ll @@ -13,30 +13,17 @@ ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: ; implicit-def: $vgpr3 -; CHECK-NEXT: v_writelane_b32 v3, s16, 0 -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 -; CHECK-NEXT: s_add_i32 s12, s33, 0x100200 -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], s12 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[34:35] +; CHECK-NEXT: v_writelane_b32 v40, s16, 0 ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s12, s14 -; CHECK-NEXT: v_readlane_b32 s14, v3, 0 +; CHECK-NEXT: v_readlane_b32 s14, v40, 0 ; CHECK-NEXT: s_mov_b64 s[16:17], s[8:9] ; CHECK-NEXT: v_mov_b32_e32 v3, v2 ; CHECK-NEXT: v_mov_b32_e32 v2, v1 ; CHECK-NEXT: v_mov_b32_e32 v1, v0 -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 -; CHECK-NEXT: s_add_i32 s8, s33, 0x100200 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s8 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: s_load_dword s8, s[16:17], 0x0 -; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v0, s8, 1 -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 -; CHECK-NEXT: s_add_i32 s8, s33, 0x100200 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s8 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[34:35] +; CHECK-NEXT: s_waitcnt lgkmcnt(0) +; CHECK-NEXT: v_writelane_b32 v40, s8, 1 ; CHECK-NEXT: ;;#ASMSTART ; CHECK-NEXT: ; def vgpr10 ; CHECK-NEXT: ;;#ASMEND @@ -69,14 +56,9 @@ ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 -; CHECK-NEXT: s_add_i32 s4, s33, 0x100200 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: s_add_i32 s4, s33, 0x100100 ; CHECK-NEXT: buffer_load_dword v10, off, s[0:3], s4 ; 4-byte Folded Reload -; CHECK-NEXT: s_waitcnt vmcnt(1) -; CHECK-NEXT: v_readlane_b32 s4, v0, 1 +; CHECK-NEXT: v_readlane_b32 s4, v40, 1 ; CHECK-NEXT: s_mov_b32 s5, 0 ; CHECK-NEXT: s_cmp_eq_u32 s4, s5 ; CHECK-NEXT: v_mov_b32_e32 v0, 0x4000 @@ -84,24 +66,14 @@ ; CHECK-NEXT: buffer_store_dword v10, v0, s[0:3], s33 offen ; 4-byte Folded Spill ; CHECK-NEXT: s_cbranch_scc1 .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %store -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 -; CHECK-NEXT: s_add_i32 s4, s33, 0x100200 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[34:35] ; CHECK-NEXT: s_add_i32 s4, s33, 0x100000 -; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s4 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s4 ; 4-byte Folded Reload ; CHECK-NEXT: ; implicit-def: $sgpr4 -; CHECK-NEXT: v_mov_b32_e32 v1, s4 +; CHECK-NEXT: v_mov_b32_e32 v0, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: ds_write_b32 v1, v2 -; CHECK-NEXT: ; kill: killed $vgpr0 +; CHECK-NEXT: ds_write_b32 v0, v1 ; CHECK-NEXT: s_endpgm ; CHECK-NEXT: .LBB0_2: ; %end -; CHECK-NEXT: s_or_saveexec_b64 s[34:35], -1 -; CHECK-NEXT: s_add_i32 s4, s33, 0x100200 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[34:35] -; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm %arr = alloca < 1339 x i32>, align 8192, addrspace(5) %cmp = icmp ne i32 %val, 0 diff --git a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll --- a/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll +++ b/llvm/test/CodeGen/AMDGPU/llc-pipeline.ll @@ -116,7 +116,6 @@ ; GCN-O0-NEXT: Fast Register Allocator ; GCN-O0-NEXT: SI lower SGPR spill instructions ; GCN-O0-NEXT: Fast Register Allocator -; GCN-O0-NEXT: SI Lower WWM Copies ; GCN-O0-NEXT: SI Fix VGPR copies ; GCN-O0-NEXT: Remove Redundant DEBUG_VALUE analysis ; GCN-O0-NEXT: Fixup Statepoint Caller Saved @@ -361,7 +360,6 @@ ; GCN-O1-NEXT: Virtual Register Map ; GCN-O1-NEXT: Live Register Matrix ; GCN-O1-NEXT: Greedy Register Allocator -; GCN-O1-NEXT: SI Lower WWM Copies ; GCN-O1-NEXT: GCN NSA Reassign ; GCN-O1-NEXT: Virtual Register Rewriter ; GCN-O1-NEXT: Stack Slot Coloring @@ -655,7 +653,6 @@ ; GCN-O1-OPTS-NEXT: Virtual Register Map ; GCN-O1-OPTS-NEXT: Live Register Matrix ; GCN-O1-OPTS-NEXT: Greedy Register Allocator -; GCN-O1-OPTS-NEXT: SI Lower WWM Copies ; GCN-O1-OPTS-NEXT: GCN NSA Reassign ; GCN-O1-OPTS-NEXT: Virtual Register Rewriter ; GCN-O1-OPTS-NEXT: Stack Slot Coloring @@ -959,7 +956,6 @@ ; GCN-O2-NEXT: Virtual Register Map ; GCN-O2-NEXT: Live Register Matrix ; GCN-O2-NEXT: Greedy Register Allocator -; GCN-O2-NEXT: SI Lower WWM Copies ; GCN-O2-NEXT: GCN NSA Reassign ; GCN-O2-NEXT: Virtual Register Rewriter ; GCN-O2-NEXT: Stack Slot Coloring @@ -1275,7 +1271,6 @@ ; GCN-O3-NEXT: Virtual Register Map ; GCN-O3-NEXT: Live Register Matrix ; GCN-O3-NEXT: Greedy Register Allocator -; GCN-O3-NEXT: SI Lower WWM Copies ; GCN-O3-NEXT: GCN NSA Reassign ; GCN-O3-NEXT: Virtual Register Rewriter ; GCN-O3-NEXT: Stack Slot Coloring diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands-non-ptr-intrinsics.ll @@ -141,108 +141,82 @@ ; W64-O0: ; %bb.0: ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: ; implicit-def: $vgpr5 -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v5, v3 +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v4, v3 ; W64-O0-NEXT: v_mov_b32_e32 v6, v2 ; W64-O0-NEXT: v_mov_b32_e32 v7, v1 -; W64-O0-NEXT: v_mov_b32_e32 v1, v0 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v7 -; W64-O0-NEXT: v_mov_b32_e32 v3, v6 -; W64-O0-NEXT: v_mov_b32_e32 v4, v5 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v7 +; W64-O0-NEXT: v_mov_b32_e32 v2, v6 +; W64-O0-NEXT: v_mov_b32_e32 v3, v4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: s_mov_b32 s4, 0 -; W64-O0-NEXT: v_writelane_b32 v0, s4, 0 +; W64-O0-NEXT: v_writelane_b32 v5, s4, 0 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 1 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 2 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_writelane_b32 v5, s4, 1 +; W64-O0-NEXT: v_writelane_b32 v5, s5, 2 ; W64-O0-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 -; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] -; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 -; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 3 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 4 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 5 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 6 +; W64-O0-NEXT: v_writelane_b32 v5, s8, 3 +; W64-O0-NEXT: v_writelane_b32 v5, s9, 4 +; W64-O0-NEXT: v_writelane_b32 v5, s10, 5 +; W64-O0-NEXT: v_writelane_b32 v5, s11, 6 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 7 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 8 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_writelane_b32 v5, s4, 7 +; W64-O0-NEXT: v_writelane_b32 v5, s5, 8 ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0-NEXT: v_readlane_b32 s4, v5, 7 +; W64-O0-NEXT: v_readlane_b32 s5, v5, 8 +; W64-O0-NEXT: v_readlane_b32 s8, v5, 3 +; W64-O0-NEXT: v_readlane_b32 s9, v5, 4 +; W64-O0-NEXT: v_readlane_b32 s10, v5, 5 +; W64-O0-NEXT: v_readlane_b32 s11, v5, 6 +; W64-O0-NEXT: v_readlane_b32 s6, v5, 0 ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 7 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 8 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 3 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 4 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: s_nop 4 +; W64-O0-NEXT: s_nop 3 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB0_1 ; W64-O0-NEXT: ; %bb.3: -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(1) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 2 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; W64-O0-NEXT: v_readlane_b32 s4, v5, 1 +; W64-O0-NEXT: v_readlane_b32 s5, v5, 2 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: ; kill: killed $vgpr1 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] @@ -494,221 +468,185 @@ ; W64-O0: ; %bb.0: ; %entry ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: ; implicit-def: $vgpr13 -; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v13, v4 +; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v14, v4 ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v7, v3 -; W64-O0-NEXT: v_mov_b32_e32 v8, v2 -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v9, v1 +; W64-O0-NEXT: v_mov_b32_e32 v6, v3 +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v7, v2 +; W64-O0-NEXT: v_mov_b32_e32 v8, v1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v3, v0 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_mov_b32_e32 v2, v0 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v14, v5 -; W64-O0-NEXT: v_mov_b32_e32 v15, v6 +; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15_vgpr16_vgpr17 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v15, v5 ; W64-O0-NEXT: s_waitcnt vmcnt(3) ; W64-O0-NEXT: v_mov_b32_e32 v16, v4 -; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_mov_b32_e32 v17, v3 +; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4_vgpr5_vgpr6 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v4, v9 -; W64-O0-NEXT: v_mov_b32_e32 v5, v8 -; W64-O0-NEXT: v_mov_b32_e32 v6, v7 -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v3, v8 +; W64-O0-NEXT: v_mov_b32_e32 v4, v7 +; W64-O0-NEXT: v_mov_b32_e32 v5, v6 +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v3, v12 -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v2, v12 +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v10 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v10 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: s_mov_b32 s4, 0 -; W64-O0-NEXT: v_writelane_b32 v0, s4, 0 +; W64-O0-NEXT: v_writelane_b32 v13, s4, 0 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 1 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 2 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_writelane_b32 v13, s4, 1 +; W64-O0-NEXT: v_writelane_b32 v13, s5, 2 ; W64-O0-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 -; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] -; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 -; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 3 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 4 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 5 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 6 +; W64-O0-NEXT: v_writelane_b32 v13, s8, 3 +; W64-O0-NEXT: v_writelane_b32 v13, s9, 4 +; W64-O0-NEXT: v_writelane_b32 v13, s10, 5 +; W64-O0-NEXT: v_writelane_b32 v13, s11, 6 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 7 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 8 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_writelane_b32 v13, s4, 7 +; W64-O0-NEXT: v_writelane_b32 v13, s5, 8 ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB1_1 Depth=1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_readlane_b32 s4, v13, 7 +; W64-O0-NEXT: v_readlane_b32 s5, v13, 8 +; W64-O0-NEXT: v_readlane_b32 s8, v13, 3 +; W64-O0-NEXT: v_readlane_b32 s9, v13, 4 +; W64-O0-NEXT: v_readlane_b32 s10, v13, 5 +; W64-O0-NEXT: v_readlane_b32 s11, v13, 6 +; W64-O0-NEXT: v_readlane_b32 s6, v13, 0 ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 7 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 8 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 3 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 4 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: s_nop 4 +; W64-O0-NEXT: s_nop 3 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB1_1 ; W64-O0-NEXT: ; %bb.3: -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 2 +; W64-O0-NEXT: v_readlane_b32 s4, v13, 1 +; W64-O0-NEXT: v_readlane_b32 s5, v13, 2 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 9 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 10 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_writelane_b32 v13, s4, 9 +; W64-O0-NEXT: v_writelane_b32 v13, s5, 10 ; W64-O0-NEXT: .LBB1_4: ; =>This Inner Loop Header: Depth=1 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 -; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] -; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 -; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 11 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 12 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 13 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 14 +; W64-O0-NEXT: v_writelane_b32 v13, s8, 11 +; W64-O0-NEXT: v_writelane_b32 v13, s9, 12 +; W64-O0-NEXT: v_writelane_b32 v13, s10, 13 +; W64-O0-NEXT: v_writelane_b32 v13, s11, 14 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 15 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 16 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_writelane_b32 v13, s4, 15 +; W64-O0-NEXT: v_writelane_b32 v13, s5, 16 ; W64-O0-NEXT: ; %bb.5: ; in Loop: Header=BB1_4 Depth=1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_readlane_b32 s4, v13, 15 +; W64-O0-NEXT: v_readlane_b32 s5, v13, 16 +; W64-O0-NEXT: v_readlane_b32 s8, v13, 11 +; W64-O0-NEXT: v_readlane_b32 s9, v13, 12 +; W64-O0-NEXT: v_readlane_b32 s10, v13, 13 +; W64-O0-NEXT: v_readlane_b32 s11, v13, 14 +; W64-O0-NEXT: v_readlane_b32 s6, v13, 0 ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 15 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 16 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 11 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 12 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 13 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 14 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: s_nop 4 +; W64-O0-NEXT: s_nop 3 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB1_4 ; W64-O0-NEXT: ; %bb.6: -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(6) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 9 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 10 +; W64-O0-NEXT: v_readlane_b32 s4, v13, 9 +; W64-O0-NEXT: v_readlane_b32 s5, v13, 10 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[4:5], v6, off +; W64-O0-NEXT: global_store_dword v[3:4], v5, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[1:2], v3, off +; W64-O0-NEXT: global_store_dword v[0:1], v2, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: ; kill: killed $vgpr0 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] @@ -1016,250 +954,204 @@ ; W64-O0: ; %bb.0: ; %entry ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: ; implicit-def: $vgpr8 -; W64-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v8, v6 -; W64-O0-NEXT: v_mov_b32_e32 v9, v5 -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v9, v7 +; W64-O0-NEXT: v_mov_b32_e32 v10, v6 +; W64-O0-NEXT: v_mov_b32_e32 v11, v5 +; W64-O0-NEXT: v_mov_b32_e32 v5, v4 +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v10, v3 -; W64-O0-NEXT: v_mov_b32_e32 v11, v2 -; W64-O0-NEXT: v_mov_b32_e32 v13, v1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v6, v0 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v5, v3 +; W64-O0-NEXT: v_mov_b32_e32 v6, v2 +; W64-O0-NEXT: v_mov_b32_e32 v7, v1 +; W64-O0-NEXT: v_mov_b32_e32 v13, v0 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v9 -; W64-O0-NEXT: v_mov_b32_e32 v3, v8 -; W64-O0-NEXT: v_mov_b32_e32 v4, v7 +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v11 +; W64-O0-NEXT: v_mov_b32_e32 v2, v10 +; W64-O0-NEXT: v_mov_b32_e32 v3, v9 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7_vgpr8_vgpr9 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v7, v13 -; W64-O0-NEXT: v_mov_b32_e32 v8, v11 -; W64-O0-NEXT: v_mov_b32_e32 v9, v10 -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v14, v7 +; W64-O0-NEXT: v_mov_b32_e32 v15, v6 +; W64-O0-NEXT: v_mov_b32_e32 v16, v5 +; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v6, v12 +; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v5, v12 +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: ;;#ASMSTART ; W64-O0-NEXT: s_mov_b32 s4, 17 ; W64-O0-NEXT: ;;#ASMEND ; W64-O0-NEXT: s_mov_b32 s5, s4 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 0 +; W64-O0-NEXT: v_writelane_b32 v8, s5, 0 ; W64-O0-NEXT: s_mov_b32 s5, 0 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 1 -; W64-O0-NEXT: v_mov_b32_e32 v1, s4 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 2 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 3 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: v_writelane_b32 v8, s5, 1 +; W64-O0-NEXT: v_mov_b32_e32 v0, s4 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: s_mov_b64 s[4:5], exec +; W64-O0-NEXT: v_writelane_b32 v8, s4, 2 +; W64-O0-NEXT: v_writelane_b32 v8, s5, 3 ; W64-O0-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 -; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] -; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 -; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 4 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 5 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 6 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 7 +; W64-O0-NEXT: v_writelane_b32 v8, s8, 4 +; W64-O0-NEXT: v_writelane_b32 v8, s9, 5 +; W64-O0-NEXT: v_writelane_b32 v8, s10, 6 +; W64-O0-NEXT: v_writelane_b32 v8, s11, 7 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 8 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 9 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_writelane_b32 v8, s4, 8 +; W64-O0-NEXT: v_writelane_b32 v8, s5, 9 ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB2_1 Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: v_readlane_b32 s4, v8, 8 +; W64-O0-NEXT: v_readlane_b32 s5, v8, 9 +; W64-O0-NEXT: v_readlane_b32 s8, v8, 4 +; W64-O0-NEXT: v_readlane_b32 s9, v8, 5 +; W64-O0-NEXT: v_readlane_b32 s10, v8, 6 +; W64-O0-NEXT: v_readlane_b32 s11, v8, 7 +; W64-O0-NEXT: v_readlane_b32 s6, v8, 1 ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 8 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 9 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 4 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 5 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 7 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 1 -; W64-O0-NEXT: s_nop 4 +; W64-O0-NEXT: s_nop 3 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB2_1 ; W64-O0-NEXT: ; %bb.3: -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(2) -; W64-O0-NEXT: v_readlane_b32 s6, v0, 2 -; W64-O0-NEXT: v_readlane_b32 s7, v0, 3 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; W64-O0-NEXT: v_readlane_b32 s6, v8, 2 +; W64-O0-NEXT: v_readlane_b32 s7, v8, 3 ; W64-O0-NEXT: s_mov_b64 exec, s[6:7] -; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 +; W64-O0-NEXT: v_readlane_b32 s4, v8, 1 ; W64-O0-NEXT: s_mov_b32 s5, 0x3ff ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_and_b32_e64 v2, v2, s5 -; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; W64-O0-NEXT: v_and_b32_e64 v1, v1, s5 +; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, s4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 10 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 11 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_writelane_b32 v8, s4, 10 +; W64-O0-NEXT: v_writelane_b32 v8, s5, 11 ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execz .LBB2_8 ; W64-O0-NEXT: ; %bb.4: ; %bb1 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 0 +; W64-O0-NEXT: v_readlane_b32 s4, v8, 0 ; W64-O0-NEXT: s_mov_b32 s5, 0 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 12 -; W64-O0-NEXT: v_mov_b32_e32 v1, s4 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; W64-O0-NEXT: v_writelane_b32 v8, s5, 12 +; W64-O0-NEXT: v_mov_b32_e32 v0, s4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5_sgpr6_sgpr7 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 13 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 14 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_writelane_b32 v8, s4, 13 +; W64-O0-NEXT: v_writelane_b32 v8, s5, 14 ; W64-O0-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 -; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] -; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 -; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 15 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 16 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 17 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 18 +; W64-O0-NEXT: v_writelane_b32 v8, s8, 15 +; W64-O0-NEXT: v_writelane_b32 v8, s9, 16 +; W64-O0-NEXT: v_writelane_b32 v8, s10, 17 +; W64-O0-NEXT: v_writelane_b32 v8, s11, 18 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 19 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 20 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_writelane_b32 v8, s4, 19 +; W64-O0-NEXT: v_writelane_b32 v8, s5, 20 ; W64-O0-NEXT: ; %bb.6: ; in Loop: Header=BB2_5 Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; W64-O0-NEXT: v_readlane_b32 s4, v8, 19 +; W64-O0-NEXT: v_readlane_b32 s5, v8, 20 +; W64-O0-NEXT: v_readlane_b32 s8, v8, 15 +; W64-O0-NEXT: v_readlane_b32 s9, v8, 16 +; W64-O0-NEXT: v_readlane_b32 s10, v8, 17 +; W64-O0-NEXT: v_readlane_b32 s11, v8, 18 +; W64-O0-NEXT: v_readlane_b32 s6, v8, 12 ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 19 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 20 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 15 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 16 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 17 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 18 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 12 -; W64-O0-NEXT: s_nop 4 +; W64-O0-NEXT: s_nop 3 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB2_5 ; W64-O0-NEXT: ; %bb.7: -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 13 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 14 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; W64-O0-NEXT: v_readlane_b32 s4, v8, 13 +; W64-O0-NEXT: v_readlane_b32 s5, v8, 14 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill ; W64-O0-NEXT: .LBB2_8: ; %bb2 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; W64-O0-NEXT: s_nop 0 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(3) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 10 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 11 +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; W64-O0-NEXT: v_readlane_b32 s4, v8, 10 +; W64-O0-NEXT: v_readlane_b32 s5, v8, 11 ; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[1:2], v3, off +; W64-O0-NEXT: global_store_dword v[0:1], v2, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: ; kill: killed $vgpr0 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll --- a/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll +++ b/llvm/test/CodeGen/AMDGPU/mubuf-legalize-operands.ll @@ -140,123 +140,94 @@ ; W64-O0: ; %bb.0: ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: ; implicit-def: $vgpr5 -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; W64-O0-NEXT: v_mov_b32_e32 v6, v2 -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v3, v1 -; W64-O0-NEXT: v_mov_b32_e32 v1, v0 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_mov_b32_e32 v2, v1 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 def $vgpr6_vgpr7 killed $exec -; W64-O0-NEXT: s_waitcnt vmcnt(1) -; W64-O0-NEXT: v_mov_b32_e32 v7, v2 -; W64-O0-NEXT: v_mov_b32_e32 v5, v7 +; W64-O0-NEXT: v_mov_b32_e32 v7, v3 +; W64-O0-NEXT: v_mov_b32_e32 v4, v7 ; W64-O0-NEXT: ; kill: def $vgpr6 killed $vgpr6 killed $vgpr6_vgpr7 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v3 -; W64-O0-NEXT: v_mov_b32_e32 v7, v2 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 killed $vgpr1_vgpr2 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v2 +; W64-O0-NEXT: v_mov_b32_e32 v7, v1 +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 killed $vgpr0_vgpr1 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v7 -; W64-O0-NEXT: v_mov_b32_e32 v3, v6 -; W64-O0-NEXT: v_mov_b32_e32 v4, v5 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v7 +; W64-O0-NEXT: v_mov_b32_e32 v2, v6 +; W64-O0-NEXT: v_mov_b32_e32 v3, v4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: s_mov_b32 s4, 0 -; W64-O0-NEXT: v_writelane_b32 v0, s4, 0 +; W64-O0-NEXT: v_writelane_b32 v5, s4, 0 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 1 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 2 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_writelane_b32 v5, s4, 1 +; W64-O0-NEXT: v_writelane_b32 v5, s5, 2 ; W64-O0-NEXT: .LBB0_1: ; =>This Inner Loop Header: Depth=1 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 -; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] -; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 -; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 3 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 4 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 5 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 6 +; W64-O0-NEXT: v_writelane_b32 v5, s8, 3 +; W64-O0-NEXT: v_writelane_b32 v5, s9, 4 +; W64-O0-NEXT: v_writelane_b32 v5, s10, 5 +; W64-O0-NEXT: v_writelane_b32 v5, s11, 6 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 7 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 8 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_writelane_b32 v5, s4, 7 +; W64-O0-NEXT: v_writelane_b32 v5, s5, 8 ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB0_1 Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; W64-O0-NEXT: v_readlane_b32 s4, v5, 7 +; W64-O0-NEXT: v_readlane_b32 s5, v5, 8 +; W64-O0-NEXT: v_readlane_b32 s8, v5, 3 +; W64-O0-NEXT: v_readlane_b32 s9, v5, 4 +; W64-O0-NEXT: v_readlane_b32 s10, v5, 5 +; W64-O0-NEXT: v_readlane_b32 s11, v5, 6 +; W64-O0-NEXT: v_readlane_b32 s6, v5, 0 ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 7 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 8 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 3 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 4 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: s_nop 4 +; W64-O0-NEXT: s_nop 3 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB0_1 ; W64-O0-NEXT: ; %bb.3: -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(1) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 2 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; W64-O0-NEXT: v_readlane_b32 s4, v5, 1 +; W64-O0-NEXT: v_readlane_b32 s5, v5, 2 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: ; kill: killed $vgpr1 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] @@ -508,89 +479,86 @@ ; W64-O0: ; %bb.0: ; %entry ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: ; implicit-def: $vgpr13 -; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v14, v6 -; W64-O0-NEXT: v_mov_b32_e32 v9, v5 -; W64-O0-NEXT: v_mov_b32_e32 v13, v4 -; W64-O0-NEXT: v_mov_b32_e32 v4, v3 -; W64-O0-NEXT: v_mov_b32_e32 v8, v2 -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v5, v1 +; W64-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v15, v6 +; W64-O0-NEXT: v_mov_b32_e32 v8, v5 +; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; W64-O0-NEXT: v_mov_b32_e32 v14, v4 +; W64-O0-NEXT: v_mov_b32_e32 v7, v2 +; W64-O0-NEXT: v_mov_b32_e32 v4, v1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v3, v0 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_mov_b32_e32 v2, v0 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v15, v7 +; W64-O0-NEXT: ; kill: def $vgpr15 killed $vgpr15 def $vgpr15_vgpr16 killed $exec +; W64-O0-NEXT: s_waitcnt vmcnt(2) +; W64-O0-NEXT: v_mov_b32_e32 v16, v5 +; W64-O0-NEXT: v_mov_b32_e32 v5, v16 ; W64-O0-NEXT: v_mov_b32_e32 v6, v15 -; W64-O0-NEXT: v_mov_b32_e32 v7, v14 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v14, v9 -; W64-O0-NEXT: v_mov_b32_e32 v9, v14 -; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 killed $vgpr13_vgpr14 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v15, v8 +; W64-O0-NEXT: v_mov_b32_e32 v8, v15 +; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 killed $vgpr14_vgpr15 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v14, v9 -; W64-O0-NEXT: v_mov_b32_e32 v15, v7 +; W64-O0-NEXT: ; kill: def $vgpr14 killed $vgpr14 def $vgpr14_vgpr15_vgpr16_vgpr17 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v15, v8 ; W64-O0-NEXT: v_mov_b32_e32 v16, v6 -; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v17, v5 +; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v17, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v9, v4 -; W64-O0-NEXT: v_mov_b32_e32 v7, v9 -; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 def $vgpr7_vgpr8 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v8, v3 +; W64-O0-NEXT: v_mov_b32_e32 v6, v8 +; W64-O0-NEXT: ; kill: def $vgpr7 killed $vgpr7 killed $vgpr7_vgpr8 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v4, v5 -; W64-O0-NEXT: v_mov_b32_e32 v9, v4 -; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 killed $vgpr3_vgpr4 killed $exec +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v3, v4 +; W64-O0-NEXT: v_mov_b32_e32 v8, v3 +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 killed $vgpr2_vgpr3 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4_vgpr5_vgpr6 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v4, v9 -; W64-O0-NEXT: v_mov_b32_e32 v5, v8 -; W64-O0-NEXT: v_mov_b32_e32 v6, v7 -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3_vgpr4_vgpr5 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v3, v8 +; W64-O0-NEXT: v_mov_b32_e32 v4, v7 +; W64-O0-NEXT: v_mov_b32_e32 v5, v6 +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v3, v12 -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v2, v12 +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v10 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v10 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 @@ -598,156 +566,122 @@ ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: s_mov_b32 s4, 0 -; W64-O0-NEXT: v_writelane_b32 v0, s4, 0 +; W64-O0-NEXT: v_writelane_b32 v13, s4, 0 ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 1 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 2 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_writelane_b32 v13, s4, 1 +; W64-O0-NEXT: v_writelane_b32 v13, s5, 2 ; W64-O0-NEXT: .LBB1_1: ; =>This Inner Loop Header: Depth=1 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 -; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] -; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 -; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 3 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 4 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 5 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 6 +; W64-O0-NEXT: v_writelane_b32 v13, s8, 3 +; W64-O0-NEXT: v_writelane_b32 v13, s9, 4 +; W64-O0-NEXT: v_writelane_b32 v13, s10, 5 +; W64-O0-NEXT: v_writelane_b32 v13, s11, 6 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 7 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 8 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_writelane_b32 v13, s4, 7 +; W64-O0-NEXT: v_writelane_b32 v13, s5, 8 ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB1_1 Depth=1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_readlane_b32 s4, v13, 7 +; W64-O0-NEXT: v_readlane_b32 s5, v13, 8 +; W64-O0-NEXT: v_readlane_b32 s8, v13, 3 +; W64-O0-NEXT: v_readlane_b32 s9, v13, 4 +; W64-O0-NEXT: v_readlane_b32 s10, v13, 5 +; W64-O0-NEXT: v_readlane_b32 s11, v13, 6 +; W64-O0-NEXT: v_readlane_b32 s6, v13, 0 ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 7 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 8 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 3 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 4 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 5 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: s_nop 4 +; W64-O0-NEXT: s_nop 3 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB1_1 ; W64-O0-NEXT: ; %bb.3: -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 2 +; W64-O0-NEXT: v_readlane_b32 s4, v13, 1 +; W64-O0-NEXT: v_readlane_b32 s5, v13, 2 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 9 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 10 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_writelane_b32 v13, s4, 9 +; W64-O0-NEXT: v_writelane_b32 v13, s5, 10 ; W64-O0-NEXT: .LBB1_4: ; =>This Inner Loop Header: Depth=1 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 -; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] -; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 -; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 11 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 12 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 13 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 14 +; W64-O0-NEXT: v_writelane_b32 v13, s8, 11 +; W64-O0-NEXT: v_writelane_b32 v13, s9, 12 +; W64-O0-NEXT: v_writelane_b32 v13, s10, 13 +; W64-O0-NEXT: v_writelane_b32 v13, s11, 14 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 15 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 16 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_writelane_b32 v13, s4, 15 +; W64-O0-NEXT: v_writelane_b32 v13, s5, 16 ; W64-O0-NEXT: ; %bb.5: ; in Loop: Header=BB1_4 Depth=1 ; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_readlane_b32 s4, v13, 15 +; W64-O0-NEXT: v_readlane_b32 s5, v13, 16 +; W64-O0-NEXT: v_readlane_b32 s8, v13, 11 +; W64-O0-NEXT: v_readlane_b32 s9, v13, 12 +; W64-O0-NEXT: v_readlane_b32 s10, v13, 13 +; W64-O0-NEXT: v_readlane_b32 s11, v13, 14 +; W64-O0-NEXT: v_readlane_b32 s6, v13, 0 ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 15 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 16 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 11 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 12 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 13 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 14 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 0 -; W64-O0-NEXT: s_nop 4 +; W64-O0-NEXT: s_nop 3 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB1_4 ; W64-O0-NEXT: ; %bb.6: -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(6) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 9 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 10 +; W64-O0-NEXT: v_readlane_b32 s4, v13, 9 +; W64-O0-NEXT: v_readlane_b32 s5, v13, 10 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[4:5], v6, off +; W64-O0-NEXT: global_store_dword v[3:4], v5, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[1:2], v3, off +; W64-O0-NEXT: global_store_dword v[0:1], v2, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: ; kill: killed $vgpr0 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v13, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] @@ -1055,288 +989,239 @@ ; W64-O0: ; %bb.0: ; %entry ; W64-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: ; implicit-def: $vgpr8 -; W64-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v6, v5 -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_store_dword v31, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v5, v4 +; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload ; W64-O0-NEXT: s_nop 0 -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; W64-O0-NEXT: v_mov_b32_e32 v4, v3 -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v13, v2 +; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; W64-O0-NEXT: v_mov_b32_e32 v10, v2 ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v10, v1 +; W64-O0-NEXT: v_mov_b32_e32 v6, v1 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:48 ; 4-byte Folded Reload -; W64-O0-NEXT: v_mov_b32_e32 v8, v0 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_mov_b32_e32 v9, v0 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v14, v4 -; W64-O0-NEXT: v_mov_b32_e32 v4, v14 -; W64-O0-NEXT: v_mov_b32_e32 v6, v13 +; W64-O0-NEXT: ; kill: def $vgpr10 killed $vgpr10 def $vgpr10_vgpr11 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v11, v3 +; W64-O0-NEXT: v_mov_b32_e32 v3, v11 +; W64-O0-NEXT: v_mov_b32_e32 v5, v10 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v9, v10 +; W64-O0-NEXT: ; kill: def $vgpr9 killed $vgpr9 def $vgpr9_vgpr10 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v10, v6 +; W64-O0-NEXT: v_mov_b32_e32 v6, v10 ; W64-O0-NEXT: v_mov_b32_e32 v13, v9 -; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 killed $vgpr8_vgpr9 killed $exec ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr8 killed $vgpr8 def $vgpr8_vgpr9_vgpr10_vgpr11 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v9, v13 -; W64-O0-NEXT: v_mov_b32_e32 v10, v6 -; W64-O0-NEXT: v_mov_b32_e32 v11, v4 -; W64-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr13 killed $vgpr13 def $vgpr13_vgpr14_vgpr15_vgpr16 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v14, v6 +; W64-O0-NEXT: v_mov_b32_e32 v15, v5 +; W64-O0-NEXT: v_mov_b32_e32 v16, v3 +; W64-O0-NEXT: buffer_store_dword v13, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v11, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v14, off, s[0:3], s32 offset:32 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v15, off, s[0:3], s32 offset:36 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v16, off, s[0:3], s32 offset:40 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr5 killed $vgpr5 def $vgpr5_vgpr6 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v6, v7 +; W64-O0-NEXT: ; kill: def $vgpr4 killed $vgpr4 def $vgpr4_vgpr5 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v5, v7 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr3 killed $vgpr3 def $vgpr3_vgpr4 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v4, v2 +; W64-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v3, v1 ; W64-O0-NEXT: ; implicit-def: $sgpr4 ; W64-O0-NEXT: ; implicit-def: $sgpr4 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v12 +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v12 +; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr4_sgpr5 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; W64-O0-NEXT: ;;#ASMSTART ; W64-O0-NEXT: s_mov_b32 s4, 17 ; W64-O0-NEXT: ;;#ASMEND ; W64-O0-NEXT: s_mov_b32 s5, s4 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 0 +; W64-O0-NEXT: v_writelane_b32 v8, s5, 0 ; W64-O0-NEXT: s_mov_b32 s5, 0 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 1 -; W64-O0-NEXT: v_mov_b32_e32 v1, s4 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 2 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 3 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 +; W64-O0-NEXT: v_writelane_b32 v8, s5, 1 +; W64-O0-NEXT: v_mov_b32_e32 v0, s4 ; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: s_mov_b64 s[4:5], exec +; W64-O0-NEXT: v_writelane_b32 v8, s4, 2 +; W64-O0-NEXT: v_writelane_b32 v8, s5, 3 ; W64-O0-NEXT: .LBB2_1: ; =>This Inner Loop Header: Depth=1 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:32 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:36 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:40 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:44 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 -; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] -; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 -; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 4 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 5 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 6 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 7 +; W64-O0-NEXT: v_writelane_b32 v8, s8, 4 +; W64-O0-NEXT: v_writelane_b32 v8, s9, 5 +; W64-O0-NEXT: v_writelane_b32 v8, s10, 6 +; W64-O0-NEXT: v_writelane_b32 v8, s11, 7 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 8 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 9 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_writelane_b32 v8, s4, 8 +; W64-O0-NEXT: v_writelane_b32 v8, s5, 9 ; W64-O0-NEXT: ; %bb.2: ; in Loop: Header=BB2_1 Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload +; W64-O0-NEXT: v_readlane_b32 s4, v8, 8 +; W64-O0-NEXT: v_readlane_b32 s5, v8, 9 +; W64-O0-NEXT: v_readlane_b32 s8, v8, 4 +; W64-O0-NEXT: v_readlane_b32 s9, v8, 5 +; W64-O0-NEXT: v_readlane_b32 s10, v8, 6 +; W64-O0-NEXT: v_readlane_b32 s11, v8, 7 +; W64-O0-NEXT: v_readlane_b32 s6, v8, 1 ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 8 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 9 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 4 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 5 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 6 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 7 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 1 -; W64-O0-NEXT: s_nop 4 +; W64-O0-NEXT: s_nop 3 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB2_1 ; W64-O0-NEXT: ; %bb.3: -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(2) -; W64-O0-NEXT: v_readlane_b32 s6, v0, 2 -; W64-O0-NEXT: v_readlane_b32 s7, v0, 3 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; W64-O0-NEXT: v_readlane_b32 s6, v8, 2 +; W64-O0-NEXT: v_readlane_b32 s7, v8, 3 ; W64-O0-NEXT: s_mov_b64 exec, s[6:7] -; W64-O0-NEXT: v_readlane_b32 s4, v0, 1 +; W64-O0-NEXT: v_readlane_b32 s4, v8, 1 ; W64-O0-NEXT: s_mov_b32 s5, 0x3ff ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_and_b32_e64 v2, v2, s5 -; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; W64-O0-NEXT: v_and_b32_e64 v1, v1, s5 +; W64-O0-NEXT: v_cmp_eq_u32_e64 s[6:7], v1, s4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 10 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 11 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_writelane_b32 v8, s4, 10 +; W64-O0-NEXT: v_writelane_b32 v8, s5, 11 ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execz .LBB2_8 ; W64-O0-NEXT: ; %bb.4: ; %bb1 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(4) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 0 +; W64-O0-NEXT: v_readlane_b32 s4, v8, 0 ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_mov_b32_e32 v7, v5 -; W64-O0-NEXT: v_mov_b32_e32 v1, v4 -; W64-O0-NEXT: v_mov_b32_e32 v5, v3 -; W64-O0-NEXT: v_mov_b32_e32 v6, v2 +; W64-O0-NEXT: v_mov_b32_e32 v6, v4 +; W64-O0-NEXT: v_mov_b32_e32 v0, v3 +; W64-O0-NEXT: v_mov_b32_e32 v4, v2 +; W64-O0-NEXT: v_mov_b32_e32 v5, v1 ; W64-O0-NEXT: ; implicit-def: $sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr5 ; W64-O0-NEXT: ; implicit-def: $sgpr5 -; W64-O0-NEXT: ; kill: def $vgpr1 killed $vgpr1 def $vgpr1_vgpr2_vgpr3_vgpr4 killed $exec -; W64-O0-NEXT: v_mov_b32_e32 v2, v7 -; W64-O0-NEXT: v_mov_b32_e32 v3, v6 -; W64-O0-NEXT: v_mov_b32_e32 v4, v5 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; W64-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3 killed $exec +; W64-O0-NEXT: v_mov_b32_e32 v1, v6 +; W64-O0-NEXT: v_mov_b32_e32 v2, v5 +; W64-O0-NEXT: v_mov_b32_e32 v3, v4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill ; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill ; W64-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; W64-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b32 s5, 0 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 12 -; W64-O0-NEXT: v_mov_b32_e32 v1, s4 -; W64-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; W64-O0-NEXT: v_writelane_b32 v8, s5, 12 +; W64-O0-NEXT: v_mov_b32_e32 v0, s4 +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill ; W64-O0-NEXT: s_mov_b64 s[4:5], exec -; W64-O0-NEXT: v_writelane_b32 v0, s4, 13 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 14 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_writelane_b32 v8, s4, 13 +; W64-O0-NEXT: v_writelane_b32 v8, s5, 14 ; W64-O0-NEXT: .LBB2_5: ; =>This Inner Loop Header: Depth=1 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload ; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readfirstlane_b32 s8, v1 -; W64-O0-NEXT: v_readfirstlane_b32 s12, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s8, v0 +; W64-O0-NEXT: v_readfirstlane_b32 s12, v1 ; W64-O0-NEXT: s_mov_b32 s4, s8 ; W64-O0-NEXT: s_mov_b32 s5, s12 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[1:2] -; W64-O0-NEXT: v_readfirstlane_b32 s7, v3 -; W64-O0-NEXT: v_readfirstlane_b32 s6, v4 +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[4:5], s[4:5], v[0:1] +; W64-O0-NEXT: v_readfirstlane_b32 s7, v2 +; W64-O0-NEXT: v_readfirstlane_b32 s6, v3 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[3:4] +; W64-O0-NEXT: v_cmp_eq_u64_e64 s[10:11], s[10:11], v[2:3] ; W64-O0-NEXT: s_and_b64 s[4:5], s[4:5], s[10:11] ; W64-O0-NEXT: ; kill: def $sgpr8 killed $sgpr8 def $sgpr8_sgpr9_sgpr10_sgpr11 ; W64-O0-NEXT: s_mov_b32 s9, s12 ; W64-O0-NEXT: s_mov_b32 s10, s7 ; W64-O0-NEXT: s_mov_b32 s11, s6 -; W64-O0-NEXT: v_writelane_b32 v0, s8, 15 -; W64-O0-NEXT: v_writelane_b32 v0, s9, 16 -; W64-O0-NEXT: v_writelane_b32 v0, s10, 17 -; W64-O0-NEXT: v_writelane_b32 v0, s11, 18 +; W64-O0-NEXT: v_writelane_b32 v8, s8, 15 +; W64-O0-NEXT: v_writelane_b32 v8, s9, 16 +; W64-O0-NEXT: v_writelane_b32 v8, s10, 17 +; W64-O0-NEXT: v_writelane_b32 v8, s11, 18 ; W64-O0-NEXT: s_and_saveexec_b64 s[4:5], s[4:5] -; W64-O0-NEXT: v_writelane_b32 v0, s4, 19 -; W64-O0-NEXT: v_writelane_b32 v0, s5, 20 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: v_writelane_b32 v8, s4, 19 +; W64-O0-NEXT: v_writelane_b32 v8, s5, 20 ; W64-O0-NEXT: ; %bb.6: ; in Loop: Header=BB2_5 Depth=1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; W64-O0-NEXT: v_readlane_b32 s4, v8, 19 +; W64-O0-NEXT: v_readlane_b32 s5, v8, 20 +; W64-O0-NEXT: v_readlane_b32 s8, v8, 15 +; W64-O0-NEXT: v_readlane_b32 s9, v8, 16 +; W64-O0-NEXT: v_readlane_b32 s10, v8, 17 +; W64-O0-NEXT: v_readlane_b32 s11, v8, 18 +; W64-O0-NEXT: v_readlane_b32 s6, v8, 12 ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 19 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 20 -; W64-O0-NEXT: v_readlane_b32 s8, v1, 15 -; W64-O0-NEXT: v_readlane_b32 s9, v1, 16 -; W64-O0-NEXT: v_readlane_b32 s10, v1, 17 -; W64-O0-NEXT: v_readlane_b32 s11, v1, 18 -; W64-O0-NEXT: v_readlane_b32 s6, v1, 12 -; W64-O0-NEXT: s_nop 4 +; W64-O0-NEXT: s_nop 3 ; W64-O0-NEXT: buffer_load_format_x v0, v0, s[8:11], s6 idxen ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill ; W64-O0-NEXT: s_xor_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_cbranch_execnz .LBB2_5 ; W64-O0-NEXT: ; %bb.7: -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] -; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: v_readlane_b32 s4, v1, 13 -; W64-O0-NEXT: v_readlane_b32 s5, v1, 14 +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; W64-O0-NEXT: v_readlane_b32 s4, v8, 13 +; W64-O0-NEXT: v_readlane_b32 s5, v8, 14 ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] -; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; W64-O0-NEXT: s_waitcnt vmcnt(0) +; W64-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill ; W64-O0-NEXT: .LBB2_8: ; %bb2 -; W64-O0-NEXT: s_or_saveexec_b64 s[16:17], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; W64-O0-NEXT: s_mov_b64 exec, s[16:17] +; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; W64-O0-NEXT: s_nop 0 ; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; W64-O0-NEXT: s_waitcnt vmcnt(3) -; W64-O0-NEXT: v_readlane_b32 s4, v0, 10 -; W64-O0-NEXT: v_readlane_b32 s5, v0, 11 +; W64-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; W64-O0-NEXT: v_readlane_b32 s4, v8, 10 +; W64-O0-NEXT: v_readlane_b32 s5, v8, 11 ; W64-O0-NEXT: s_or_b64 exec, exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: global_store_dword v[1:2], v3, off +; W64-O0-NEXT: global_store_dword v[0:1], v2, off ; W64-O0-NEXT: s_waitcnt vmcnt(0) -; W64-O0-NEXT: ; kill: killed $vgpr0 ; W64-O0-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; W64-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; W64-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; W64-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; W64-O0-NEXT: s_mov_b64 exec, s[4:5] ; W64-O0-NEXT: s_waitcnt vmcnt(0) ; W64-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll --- a/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll +++ b/llvm/test/CodeGen/AMDGPU/mul24-pass-ordering.ll @@ -192,12 +192,13 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[6:7] -; GFX9-NEXT: v_writelane_b32 v40, s4, 5 ; GFX9-NEXT: v_writelane_b32 v40, s30, 0 ; GFX9-NEXT: v_writelane_b32 v40, s31, 1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 ; GFX9-NEXT: v_writelane_b32 v40, s34, 2 +; GFX9-NEXT: v_writelane_b32 v44, s4, 0 ; GFX9-NEXT: v_writelane_b32 v40, s36, 3 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, foo@gotpcrel32@lo+4 @@ -229,9 +230,10 @@ ; GFX9-NEXT: v_readlane_b32 s34, v40, 2 ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v40, 5 +; GFX9-NEXT: v_readlane_b32 s4, v44, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v44, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/need-fp-from-vgpr-spills.ll @@ -27,7 +27,7 @@ ; CHECK-LABEL: csr_vgpr_spill_fp_callee: ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s18, s33 +; CHECK-NEXT: s_mov_b32 s24, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill @@ -54,7 +54,7 @@ ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s18 +; CHECK-NEXT: s_mov_b32 s33, s24 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] bb: @@ -66,16 +66,12 @@ define amdgpu_kernel void @kernel_call() { ; CHECK-LABEL: kernel_call: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_mov_b32 s32, 0x400 +; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: v_writelane_b32 v3, s16, 0 -; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1 -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[24:25] ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s12, s14 ; CHECK-NEXT: v_readlane_b32 s14, v3, 0 @@ -93,10 +89,6 @@ ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[24:25] -; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm bb: tail call fastcc void @csr_vgpr_spill_fp_callee() @@ -121,9 +113,9 @@ ; CHECK-NEXT: s_addc_u32 s17, s17, callee_has_fp@rel32@hi+12 ; CHECK-NEXT: v_readlane_b32 s33, v1, 0 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_xor_saveexec_b64 s[18:19], -1 +; CHECK-NEXT: s_xor_saveexec_b64 s[20:21], -1 ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[18:19] +; CHECK-NEXT: s_mov_b64 exec, s[20:21] ; CHECK-NEXT: s_setpc_b64 s[16:17] bb: call void asm sideeffect "; clobber csr v40", "~{v40}"() @@ -134,16 +126,12 @@ define amdgpu_kernel void @kernel_tailcall() { ; CHECK-LABEL: kernel_tailcall: ; CHECK: ; %bb.0: ; %bb -; CHECK-NEXT: s_mov_b32 s32, 0x400 +; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: v_writelane_b32 v3, s16, 0 -; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1 -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[24:25] ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s12, s14 ; CHECK-NEXT: v_readlane_b32 s14, v3, 0 @@ -161,10 +149,6 @@ ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[24:25] -; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm bb: tail call fastcc void @csr_vgpr_spill_fp_tailcall_callee() @@ -188,7 +172,7 @@ ; CHECK-LABEL: caller_save_vgpr_spill_fp_tail_call: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s18, s33 +; CHECK-NEXT: s_mov_b32 s24, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], s33 ; 4-byte Folded Spill @@ -210,7 +194,7 @@ ; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s18 +; CHECK-NEXT: s_mov_b32 s33, s24 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -222,7 +206,7 @@ ; CHECK-LABEL: caller_save_vgpr_spill_fp: ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_mov_b32 s19, s33 +; CHECK-NEXT: s_mov_b32 s25, s33 ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_xor_saveexec_b64 s[16:17], -1 ; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill @@ -244,7 +228,7 @@ ; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], s33 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; CHECK-NEXT: s_mov_b32 s33, s19 +; CHECK-NEXT: s_mov_b32 s33, s25 ; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: s_setpc_b64 s[30:31] entry: @@ -255,16 +239,12 @@ define protected amdgpu_kernel void @kernel() { ; CHECK-LABEL: kernel: ; CHECK: ; %bb.0: ; %entry -; CHECK-NEXT: s_mov_b32 s32, 0x400 +; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_add_u32 flat_scratch_lo, s12, s17 ; CHECK-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: ; implicit-def: $vgpr3 ; CHECK-NEXT: v_writelane_b32 v3, s16, 0 -; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1 -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[24:25] ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s12, s14 ; CHECK-NEXT: v_readlane_b32 s14, v3, 0 @@ -282,11 +262,6 @@ ; CHECK-NEXT: s_mov_b64 s[0:1], s[20:21] ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: ; kill: def $vgpr1 killed $vgpr0 killed $exec -; CHECK-NEXT: s_or_saveexec_b64 s[24:25], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[24:25] -; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm entry: %call = call i32 @caller_save_vgpr_spill_fp() diff --git a/llvm/test/CodeGen/AMDGPU/nested-calls.ll b/llvm/test/CodeGen/AMDGPU/nested-calls.ll --- a/llvm/test/CodeGen/AMDGPU/nested-calls.ll +++ b/llvm/test/CodeGen/AMDGPU/nested-calls.ll @@ -15,8 +15,9 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 [[COPY_EXEC0:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC0]] -; GCN-DAG: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 2 +; GCN-DAG: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0 ; GCN-DAG: v_writelane_b32 v40, s30, 0 ; GCN-DAG: v_writelane_b32 v40, s31, 1 @@ -25,9 +26,10 @@ ; GCN: v_readlane_b32 s31, v40, 1 ; GCN: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 2 +; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0 ; GCN: s_or_saveexec_b64 [[COPY_EXEC1:s\[[0-9]+:[0-9]+\]]], -1{{$}} ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, [[COPY_EXEC1]] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] diff --git a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll --- a/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll +++ b/llvm/test/CodeGen/AMDGPU/no-source-locations-in-prologue.ll @@ -17,8 +17,9 @@ ; CHECK-NEXT: s_mov_b32 s33, s32 ; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 ; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: s_mov_b64 exec, s[18:19] -; CHECK-NEXT: v_writelane_b32 v40, s16, 2 +; CHECK-NEXT: v_writelane_b32 v41, s16, 0 ; CHECK-NEXT: s_add_i32 s32, s32, 0x400 ; CHECK-NEXT: v_writelane_b32 v40, s30, 0 ; CHECK-NEXT: v_writelane_b32 v40, s31, 1 @@ -38,11 +39,12 @@ ; CHECK-NEXT: .loc 0 32 1 ; lane-info.cpp:32:1 ; CHECK-NEXT: v_readlane_b32 s31, v40, 1 ; CHECK-NEXT: v_readlane_b32 s30, v40, 0 -; CHECK-NEXT: v_readlane_b32 s4, v40, 2 +; CHECK-NEXT: v_readlane_b32 s4, v41, 0 ; CHECK-NEXT: s_or_saveexec_b64 s[6:7], -1 ; CHECK-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_mov_b64 exec, s[6:7] -; CHECK-NEXT: .loc 0 32 1 epilogue_begin is_stmt 0 ; lane-info.cpp:32:1 +; CHECK-NEXT: .loc 0 32 1 epilogue_begin is_stmt 0 ; lane-info.cpp:32:1 ; CHECK-NEXT: s_add_i32 s32, s32, 0xfffffc00 ; CHECK-NEXT: s_mov_b32 s33, s4 ; CHECK-NEXT: s_waitcnt vmcnt(0) diff --git a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll --- a/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll +++ b/llvm/test/CodeGen/AMDGPU/partial-sgpr-to-vgpr-spills.ll @@ -11,107 +11,95 @@ define amdgpu_kernel void @spill_sgprs_to_multiple_vgprs(ptr addrspace(1) %out, i32 %in) #0 { ; GCN-LABEL: spill_sgprs_to_multiple_vgprs: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s92, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s93, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s94, -1 -; GCN-NEXT: s_mov_b32 s95, 0xe8f000 -; GCN-NEXT: s_add_u32 s92, s92, s11 -; GCN-NEXT: s_addc_u32 s93, s93, 0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 -; GCN-NEXT: ; implicit-def: $vgpr2 ; GCN-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v2, s4, 0 -; GCN-NEXT: v_writelane_b32 v2, s5, 1 -; GCN-NEXT: v_writelane_b32 v2, s6, 2 -; GCN-NEXT: v_writelane_b32 v2, s7, 3 -; GCN-NEXT: v_writelane_b32 v2, s8, 4 -; GCN-NEXT: v_writelane_b32 v2, s9, 5 -; GCN-NEXT: v_writelane_b32 v2, s10, 6 -; GCN-NEXT: v_writelane_b32 v2, s11, 7 +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v2, s4, 8 -; GCN-NEXT: v_writelane_b32 v2, s5, 9 -; GCN-NEXT: v_writelane_b32 v2, s6, 10 -; GCN-NEXT: v_writelane_b32 v2, s7, 11 -; GCN-NEXT: v_writelane_b32 v2, s8, 12 -; GCN-NEXT: v_writelane_b32 v2, s9, 13 -; GCN-NEXT: v_writelane_b32 v2, s10, 14 -; GCN-NEXT: v_writelane_b32 v2, s11, 15 +; GCN-NEXT: v_writelane_b32 v0, s4, 8 +; GCN-NEXT: v_writelane_b32 v0, s5, 9 +; GCN-NEXT: v_writelane_b32 v0, s6, 10 +; GCN-NEXT: v_writelane_b32 v0, s7, 11 +; GCN-NEXT: v_writelane_b32 v0, s8, 12 +; GCN-NEXT: v_writelane_b32 v0, s9, 13 +; GCN-NEXT: v_writelane_b32 v0, s10, 14 +; GCN-NEXT: v_writelane_b32 v0, s11, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v2, s4, 16 -; GCN-NEXT: v_writelane_b32 v2, s5, 17 -; GCN-NEXT: v_writelane_b32 v2, s6, 18 -; GCN-NEXT: v_writelane_b32 v2, s7, 19 -; GCN-NEXT: v_writelane_b32 v2, s8, 20 -; GCN-NEXT: v_writelane_b32 v2, s9, 21 -; GCN-NEXT: v_writelane_b32 v2, s10, 22 -; GCN-NEXT: v_writelane_b32 v2, s11, 23 +; GCN-NEXT: v_writelane_b32 v0, s4, 16 +; GCN-NEXT: v_writelane_b32 v0, s5, 17 +; GCN-NEXT: v_writelane_b32 v0, s6, 18 +; GCN-NEXT: v_writelane_b32 v0, s7, 19 +; GCN-NEXT: v_writelane_b32 v0, s8, 20 +; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: v_writelane_b32 v0, s10, 22 +; GCN-NEXT: v_writelane_b32 v0, s11, 23 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v2, s4, 24 -; GCN-NEXT: v_writelane_b32 v2, s5, 25 -; GCN-NEXT: v_writelane_b32 v2, s6, 26 -; GCN-NEXT: v_writelane_b32 v2, s7, 27 -; GCN-NEXT: v_writelane_b32 v2, s8, 28 -; GCN-NEXT: v_writelane_b32 v2, s9, 29 -; GCN-NEXT: v_writelane_b32 v2, s10, 30 -; GCN-NEXT: v_writelane_b32 v2, s11, 31 +; GCN-NEXT: v_writelane_b32 v0, s4, 24 +; GCN-NEXT: v_writelane_b32 v0, s5, 25 +; GCN-NEXT: v_writelane_b32 v0, s6, 26 +; GCN-NEXT: v_writelane_b32 v0, s7, 27 +; GCN-NEXT: v_writelane_b32 v0, s8, 28 +; GCN-NEXT: v_writelane_b32 v0, s9, 29 +; GCN-NEXT: v_writelane_b32 v0, s10, 30 +; GCN-NEXT: v_writelane_b32 v0, s11, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v2, s4, 32 -; GCN-NEXT: v_writelane_b32 v2, s5, 33 -; GCN-NEXT: v_writelane_b32 v2, s6, 34 -; GCN-NEXT: v_writelane_b32 v2, s7, 35 -; GCN-NEXT: v_writelane_b32 v2, s8, 36 -; GCN-NEXT: v_writelane_b32 v2, s9, 37 -; GCN-NEXT: v_writelane_b32 v2, s10, 38 -; GCN-NEXT: v_writelane_b32 v2, s11, 39 +; GCN-NEXT: v_writelane_b32 v0, s4, 32 +; GCN-NEXT: v_writelane_b32 v0, s5, 33 +; GCN-NEXT: v_writelane_b32 v0, s6, 34 +; GCN-NEXT: v_writelane_b32 v0, s7, 35 +; GCN-NEXT: v_writelane_b32 v0, s8, 36 +; GCN-NEXT: v_writelane_b32 v0, s9, 37 +; GCN-NEXT: v_writelane_b32 v0, s10, 38 +; GCN-NEXT: v_writelane_b32 v0, s11, 39 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v2, s4, 40 -; GCN-NEXT: v_writelane_b32 v2, s5, 41 -; GCN-NEXT: v_writelane_b32 v2, s6, 42 -; GCN-NEXT: v_writelane_b32 v2, s7, 43 -; GCN-NEXT: v_writelane_b32 v2, s8, 44 -; GCN-NEXT: v_writelane_b32 v2, s9, 45 -; GCN-NEXT: v_writelane_b32 v2, s10, 46 -; GCN-NEXT: v_writelane_b32 v2, s11, 47 +; GCN-NEXT: v_writelane_b32 v0, s4, 40 +; GCN-NEXT: v_writelane_b32 v0, s5, 41 +; GCN-NEXT: v_writelane_b32 v0, s6, 42 +; GCN-NEXT: v_writelane_b32 v0, s7, 43 +; GCN-NEXT: v_writelane_b32 v0, s8, 44 +; GCN-NEXT: v_writelane_b32 v0, s9, 45 +; GCN-NEXT: v_writelane_b32 v0, s10, 46 +; GCN-NEXT: v_writelane_b32 v0, s11, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v2, s4, 48 -; GCN-NEXT: v_writelane_b32 v2, s5, 49 -; GCN-NEXT: v_writelane_b32 v2, s6, 50 -; GCN-NEXT: v_writelane_b32 v2, s7, 51 -; GCN-NEXT: v_writelane_b32 v2, s8, 52 -; GCN-NEXT: v_writelane_b32 v2, s9, 53 -; GCN-NEXT: v_writelane_b32 v2, s10, 54 -; GCN-NEXT: v_writelane_b32 v2, s11, 55 +; GCN-NEXT: v_writelane_b32 v0, s4, 48 +; GCN-NEXT: v_writelane_b32 v0, s5, 49 +; GCN-NEXT: v_writelane_b32 v0, s6, 50 +; GCN-NEXT: v_writelane_b32 v0, s7, 51 +; GCN-NEXT: v_writelane_b32 v0, s8, 52 +; GCN-NEXT: v_writelane_b32 v0, s9, 53 +; GCN-NEXT: v_writelane_b32 v0, s10, 54 +; GCN-NEXT: v_writelane_b32 v0, s11, 55 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v2, s4, 56 -; GCN-NEXT: v_writelane_b32 v2, s5, 57 -; GCN-NEXT: v_writelane_b32 v2, s6, 58 -; GCN-NEXT: v_writelane_b32 v2, s7, 59 -; GCN-NEXT: v_writelane_b32 v2, s8, 60 -; GCN-NEXT: v_writelane_b32 v2, s9, 61 -; GCN-NEXT: v_writelane_b32 v2, s10, 62 -; GCN-NEXT: v_writelane_b32 v2, s11, 63 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v2, off, s[92:95], 0 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: v_writelane_b32 v0, s4, 56 +; GCN-NEXT: v_writelane_b32 v0, s5, 57 +; GCN-NEXT: v_writelane_b32 v0, s6, 58 +; GCN-NEXT: v_writelane_b32 v0, s7, 59 +; GCN-NEXT: v_writelane_b32 v0, s8, 60 +; GCN-NEXT: v_writelane_b32 v0, s9, 61 +; GCN-NEXT: v_writelane_b32 v0, s10, 62 +; GCN-NEXT: v_writelane_b32 v0, s11, 63 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND @@ -200,192 +188,176 @@ ; GCN-NEXT: v_writelane_b32 v1, s9, 61 ; GCN-NEXT: v_writelane_b32 v1, s10, 62 ; GCN-NEXT: v_writelane_b32 v1, s11, 63 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 0 -; GCN-NEXT: v_writelane_b32 v0, s5, 1 -; GCN-NEXT: v_writelane_b32 v0, s6, 2 -; GCN-NEXT: v_writelane_b32 v0, s7, 3 -; GCN-NEXT: v_writelane_b32 v0, s8, 4 -; GCN-NEXT: v_writelane_b32 v0, s9, 5 -; GCN-NEXT: v_writelane_b32 v0, s10, 6 -; GCN-NEXT: v_writelane_b32 v0, s11, 7 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: v_writelane_b32 v2, s4, 0 +; GCN-NEXT: v_writelane_b32 v2, s5, 1 +; GCN-NEXT: v_writelane_b32 v2, s6, 2 +; GCN-NEXT: v_writelane_b32 v2, s7, 3 +; GCN-NEXT: v_writelane_b32 v2, s8, 4 +; GCN-NEXT: v_writelane_b32 v2, s9, 5 +; GCN-NEXT: v_writelane_b32 v2, s10, 6 +; GCN-NEXT: v_writelane_b32 v2, s11, 7 ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s8, v2, 56 -; GCN-NEXT: v_readlane_b32 s9, v2, 57 -; GCN-NEXT: v_readlane_b32 s10, v2, 58 -; GCN-NEXT: v_readlane_b32 s11, v2, 59 -; GCN-NEXT: v_readlane_b32 s12, v2, 60 -; GCN-NEXT: v_readlane_b32 s13, v2, 61 -; GCN-NEXT: v_readlane_b32 s14, v2, 62 -; GCN-NEXT: v_readlane_b32 s15, v2, 63 -; GCN-NEXT: v_readlane_b32 s16, v2, 48 -; GCN-NEXT: v_readlane_b32 s17, v2, 49 -; GCN-NEXT: v_readlane_b32 s18, v2, 50 -; GCN-NEXT: v_readlane_b32 s19, v2, 51 -; GCN-NEXT: v_readlane_b32 s20, v2, 52 -; GCN-NEXT: v_readlane_b32 s21, v2, 53 -; GCN-NEXT: v_readlane_b32 s22, v2, 54 -; GCN-NEXT: v_readlane_b32 s23, v2, 55 -; GCN-NEXT: v_readlane_b32 s24, v2, 40 -; GCN-NEXT: v_readlane_b32 s25, v2, 41 -; GCN-NEXT: v_readlane_b32 s26, v2, 42 -; GCN-NEXT: v_readlane_b32 s27, v2, 43 -; GCN-NEXT: v_readlane_b32 s28, v2, 44 -; GCN-NEXT: v_readlane_b32 s29, v2, 45 -; GCN-NEXT: v_readlane_b32 s30, v2, 46 -; GCN-NEXT: v_readlane_b32 s31, v2, 47 -; GCN-NEXT: v_readlane_b32 s36, v2, 32 -; GCN-NEXT: v_readlane_b32 s37, v2, 33 -; GCN-NEXT: v_readlane_b32 s38, v2, 34 -; GCN-NEXT: v_readlane_b32 s39, v2, 35 -; GCN-NEXT: v_readlane_b32 s40, v2, 36 -; GCN-NEXT: v_readlane_b32 s41, v2, 37 -; GCN-NEXT: v_readlane_b32 s42, v2, 38 -; GCN-NEXT: v_readlane_b32 s43, v2, 39 -; GCN-NEXT: v_readlane_b32 s44, v2, 24 -; GCN-NEXT: v_readlane_b32 s45, v2, 25 -; GCN-NEXT: v_readlane_b32 s46, v2, 26 -; GCN-NEXT: v_readlane_b32 s47, v2, 27 -; GCN-NEXT: v_readlane_b32 s48, v2, 28 -; GCN-NEXT: v_readlane_b32 s49, v2, 29 -; GCN-NEXT: v_readlane_b32 s50, v2, 30 -; GCN-NEXT: v_readlane_b32 s51, v2, 31 -; GCN-NEXT: v_readlane_b32 s52, v2, 16 -; GCN-NEXT: v_readlane_b32 s53, v2, 17 -; GCN-NEXT: v_readlane_b32 s54, v2, 18 -; GCN-NEXT: v_readlane_b32 s55, v2, 19 -; GCN-NEXT: v_readlane_b32 s56, v2, 20 -; GCN-NEXT: v_readlane_b32 s57, v2, 21 -; GCN-NEXT: v_readlane_b32 s58, v2, 22 -; GCN-NEXT: v_readlane_b32 s59, v2, 23 -; GCN-NEXT: v_readlane_b32 s60, v2, 8 -; GCN-NEXT: v_readlane_b32 s61, v2, 9 -; GCN-NEXT: v_readlane_b32 s62, v2, 10 -; GCN-NEXT: v_readlane_b32 s63, v2, 11 -; GCN-NEXT: v_readlane_b32 s64, v2, 12 -; GCN-NEXT: v_readlane_b32 s65, v2, 13 -; GCN-NEXT: v_readlane_b32 s66, v2, 14 -; GCN-NEXT: v_readlane_b32 s67, v2, 15 -; GCN-NEXT: v_readlane_b32 s68, v2, 0 -; GCN-NEXT: v_readlane_b32 s69, v2, 1 -; GCN-NEXT: v_readlane_b32 s70, v2, 2 -; GCN-NEXT: v_readlane_b32 s71, v2, 3 -; GCN-NEXT: v_readlane_b32 s72, v2, 4 -; GCN-NEXT: v_readlane_b32 s73, v2, 5 -; GCN-NEXT: v_readlane_b32 s74, v2, 6 -; GCN-NEXT: v_readlane_b32 s75, v2, 7 -; GCN-NEXT: v_readlane_b32 s76, v1, 56 -; GCN-NEXT: v_readlane_b32 s77, v1, 57 -; GCN-NEXT: v_readlane_b32 s78, v1, 58 -; GCN-NEXT: v_readlane_b32 s79, v1, 59 -; GCN-NEXT: v_readlane_b32 s80, v1, 60 -; GCN-NEXT: v_readlane_b32 s81, v1, 61 -; GCN-NEXT: v_readlane_b32 s82, v1, 62 -; GCN-NEXT: v_readlane_b32 s83, v1, 63 -; GCN-NEXT: v_readlane_b32 s84, v1, 48 -; GCN-NEXT: v_readlane_b32 s85, v1, 49 -; GCN-NEXT: v_readlane_b32 s86, v1, 50 -; GCN-NEXT: v_readlane_b32 s87, v1, 51 -; GCN-NEXT: v_readlane_b32 s88, v1, 52 -; GCN-NEXT: v_readlane_b32 s89, v1, 53 -; GCN-NEXT: v_readlane_b32 s90, v1, 54 -; GCN-NEXT: v_readlane_b32 s91, v1, 55 -; GCN-NEXT: v_readlane_b32 s0, v1, 0 -; GCN-NEXT: v_readlane_b32 s1, v1, 1 -; GCN-NEXT: v_readlane_b32 s2, v1, 2 -; GCN-NEXT: v_readlane_b32 s3, v1, 3 -; GCN-NEXT: v_readlane_b32 s4, v1, 4 -; GCN-NEXT: v_readlane_b32 s5, v1, 5 -; GCN-NEXT: v_readlane_b32 s6, v1, 6 -; GCN-NEXT: v_readlane_b32 s7, v1, 7 +; GCN-NEXT: v_readlane_b32 s8, v1, 56 +; GCN-NEXT: v_readlane_b32 s9, v1, 57 +; GCN-NEXT: v_readlane_b32 s10, v1, 58 +; GCN-NEXT: v_readlane_b32 s11, v1, 59 +; GCN-NEXT: v_readlane_b32 s12, v1, 60 +; GCN-NEXT: v_readlane_b32 s13, v1, 61 +; GCN-NEXT: v_readlane_b32 s14, v1, 62 +; GCN-NEXT: v_readlane_b32 s15, v1, 63 +; GCN-NEXT: v_readlane_b32 s16, v1, 48 +; GCN-NEXT: v_readlane_b32 s17, v1, 49 +; GCN-NEXT: v_readlane_b32 s18, v1, 50 +; GCN-NEXT: v_readlane_b32 s19, v1, 51 +; GCN-NEXT: v_readlane_b32 s20, v1, 52 +; GCN-NEXT: v_readlane_b32 s21, v1, 53 +; GCN-NEXT: v_readlane_b32 s22, v1, 54 +; GCN-NEXT: v_readlane_b32 s23, v1, 55 +; GCN-NEXT: v_readlane_b32 s24, v1, 40 +; GCN-NEXT: v_readlane_b32 s25, v1, 41 +; GCN-NEXT: v_readlane_b32 s26, v1, 42 +; GCN-NEXT: v_readlane_b32 s27, v1, 43 +; GCN-NEXT: v_readlane_b32 s28, v1, 44 +; GCN-NEXT: v_readlane_b32 s29, v1, 45 +; GCN-NEXT: v_readlane_b32 s30, v1, 46 +; GCN-NEXT: v_readlane_b32 s31, v1, 47 +; GCN-NEXT: v_readlane_b32 s36, v1, 32 +; GCN-NEXT: v_readlane_b32 s37, v1, 33 +; GCN-NEXT: v_readlane_b32 s38, v1, 34 +; GCN-NEXT: v_readlane_b32 s39, v1, 35 +; GCN-NEXT: v_readlane_b32 s40, v1, 36 +; GCN-NEXT: v_readlane_b32 s41, v1, 37 +; GCN-NEXT: v_readlane_b32 s42, v1, 38 +; GCN-NEXT: v_readlane_b32 s43, v1, 39 +; GCN-NEXT: v_readlane_b32 s44, v1, 24 +; GCN-NEXT: v_readlane_b32 s45, v1, 25 +; GCN-NEXT: v_readlane_b32 s46, v1, 26 +; GCN-NEXT: v_readlane_b32 s47, v1, 27 +; GCN-NEXT: v_readlane_b32 s48, v1, 28 +; GCN-NEXT: v_readlane_b32 s49, v1, 29 +; GCN-NEXT: v_readlane_b32 s50, v1, 30 +; GCN-NEXT: v_readlane_b32 s51, v1, 31 +; GCN-NEXT: v_readlane_b32 s52, v1, 16 +; GCN-NEXT: v_readlane_b32 s53, v1, 17 +; GCN-NEXT: v_readlane_b32 s54, v1, 18 +; GCN-NEXT: v_readlane_b32 s55, v1, 19 +; GCN-NEXT: v_readlane_b32 s56, v1, 20 +; GCN-NEXT: v_readlane_b32 s57, v1, 21 +; GCN-NEXT: v_readlane_b32 s58, v1, 22 +; GCN-NEXT: v_readlane_b32 s59, v1, 23 +; GCN-NEXT: v_readlane_b32 s60, v1, 8 +; GCN-NEXT: v_readlane_b32 s61, v1, 9 +; GCN-NEXT: v_readlane_b32 s62, v1, 10 +; GCN-NEXT: v_readlane_b32 s63, v1, 11 +; GCN-NEXT: v_readlane_b32 s64, v1, 12 +; GCN-NEXT: v_readlane_b32 s65, v1, 13 +; GCN-NEXT: v_readlane_b32 s66, v1, 14 +; GCN-NEXT: v_readlane_b32 s67, v1, 15 +; GCN-NEXT: v_readlane_b32 s68, v1, 0 +; GCN-NEXT: v_readlane_b32 s69, v1, 1 +; GCN-NEXT: v_readlane_b32 s70, v1, 2 +; GCN-NEXT: v_readlane_b32 s71, v1, 3 +; GCN-NEXT: v_readlane_b32 s72, v1, 4 +; GCN-NEXT: v_readlane_b32 s73, v1, 5 +; GCN-NEXT: v_readlane_b32 s74, v1, 6 +; GCN-NEXT: v_readlane_b32 s75, v1, 7 +; GCN-NEXT: v_readlane_b32 s76, v0, 56 +; GCN-NEXT: v_readlane_b32 s77, v0, 57 +; GCN-NEXT: v_readlane_b32 s78, v0, 58 +; GCN-NEXT: v_readlane_b32 s79, v0, 59 +; GCN-NEXT: v_readlane_b32 s80, v0, 60 +; GCN-NEXT: v_readlane_b32 s81, v0, 61 +; GCN-NEXT: v_readlane_b32 s82, v0, 62 +; GCN-NEXT: v_readlane_b32 s83, v0, 63 +; GCN-NEXT: v_readlane_b32 s84, v0, 48 +; GCN-NEXT: v_readlane_b32 s85, v0, 49 +; GCN-NEXT: v_readlane_b32 s86, v0, 50 +; GCN-NEXT: v_readlane_b32 s87, v0, 51 +; GCN-NEXT: v_readlane_b32 s88, v0, 52 +; GCN-NEXT: v_readlane_b32 s89, v0, 53 +; GCN-NEXT: v_readlane_b32 s90, v0, 54 +; GCN-NEXT: v_readlane_b32 s91, v0, 55 +; GCN-NEXT: v_readlane_b32 s0, v0, 0 +; GCN-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-NEXT: v_readlane_b32 s2, v0, 2 +; GCN-NEXT: v_readlane_b32 s3, v0, 3 +; GCN-NEXT: v_readlane_b32 s4, v0, 4 +; GCN-NEXT: v_readlane_b32 s5, v0, 5 +; GCN-NEXT: v_readlane_b32 s6, v0, 6 +; GCN-NEXT: v_readlane_b32 s7, v0, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 8 -; GCN-NEXT: v_readlane_b32 s1, v1, 9 -; GCN-NEXT: v_readlane_b32 s2, v1, 10 -; GCN-NEXT: v_readlane_b32 s3, v1, 11 -; GCN-NEXT: v_readlane_b32 s4, v1, 12 -; GCN-NEXT: v_readlane_b32 s5, v1, 13 -; GCN-NEXT: v_readlane_b32 s6, v1, 14 -; GCN-NEXT: v_readlane_b32 s7, v1, 15 +; GCN-NEXT: v_readlane_b32 s0, v0, 8 +; GCN-NEXT: v_readlane_b32 s1, v0, 9 +; GCN-NEXT: v_readlane_b32 s2, v0, 10 +; GCN-NEXT: v_readlane_b32 s3, v0, 11 +; GCN-NEXT: v_readlane_b32 s4, v0, 12 +; GCN-NEXT: v_readlane_b32 s5, v0, 13 +; GCN-NEXT: v_readlane_b32 s6, v0, 14 +; GCN-NEXT: v_readlane_b32 s7, v0, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 16 -; GCN-NEXT: v_readlane_b32 s1, v1, 17 -; GCN-NEXT: v_readlane_b32 s2, v1, 18 -; GCN-NEXT: v_readlane_b32 s3, v1, 19 -; GCN-NEXT: v_readlane_b32 s4, v1, 20 -; GCN-NEXT: v_readlane_b32 s5, v1, 21 -; GCN-NEXT: v_readlane_b32 s6, v1, 22 -; GCN-NEXT: v_readlane_b32 s7, v1, 23 +; GCN-NEXT: v_readlane_b32 s0, v0, 16 +; GCN-NEXT: v_readlane_b32 s1, v0, 17 +; GCN-NEXT: v_readlane_b32 s2, v0, 18 +; GCN-NEXT: v_readlane_b32 s3, v0, 19 +; GCN-NEXT: v_readlane_b32 s4, v0, 20 +; GCN-NEXT: v_readlane_b32 s5, v0, 21 +; GCN-NEXT: v_readlane_b32 s6, v0, 22 +; GCN-NEXT: v_readlane_b32 s7, v0, 23 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 24 -; GCN-NEXT: v_readlane_b32 s1, v1, 25 -; GCN-NEXT: v_readlane_b32 s2, v1, 26 -; GCN-NEXT: v_readlane_b32 s3, v1, 27 -; GCN-NEXT: v_readlane_b32 s4, v1, 28 -; GCN-NEXT: v_readlane_b32 s5, v1, 29 -; GCN-NEXT: v_readlane_b32 s6, v1, 30 -; GCN-NEXT: v_readlane_b32 s7, v1, 31 +; GCN-NEXT: v_readlane_b32 s0, v0, 24 +; GCN-NEXT: v_readlane_b32 s1, v0, 25 +; GCN-NEXT: v_readlane_b32 s2, v0, 26 +; GCN-NEXT: v_readlane_b32 s3, v0, 27 +; GCN-NEXT: v_readlane_b32 s4, v0, 28 +; GCN-NEXT: v_readlane_b32 s5, v0, 29 +; GCN-NEXT: v_readlane_b32 s6, v0, 30 +; GCN-NEXT: v_readlane_b32 s7, v0, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 32 -; GCN-NEXT: v_readlane_b32 s1, v1, 33 -; GCN-NEXT: v_readlane_b32 s2, v1, 34 -; GCN-NEXT: v_readlane_b32 s3, v1, 35 -; GCN-NEXT: v_readlane_b32 s4, v1, 36 -; GCN-NEXT: v_readlane_b32 s5, v1, 37 -; GCN-NEXT: v_readlane_b32 s6, v1, 38 -; GCN-NEXT: v_readlane_b32 s7, v1, 39 +; GCN-NEXT: v_readlane_b32 s0, v0, 32 +; GCN-NEXT: v_readlane_b32 s1, v0, 33 +; GCN-NEXT: v_readlane_b32 s2, v0, 34 +; GCN-NEXT: v_readlane_b32 s3, v0, 35 +; GCN-NEXT: v_readlane_b32 s4, v0, 36 +; GCN-NEXT: v_readlane_b32 s5, v0, 37 +; GCN-NEXT: v_readlane_b32 s6, v0, 38 +; GCN-NEXT: v_readlane_b32 s7, v0, 39 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v1, 40 -; GCN-NEXT: v_readlane_b32 s1, v1, 41 -; GCN-NEXT: v_readlane_b32 s2, v1, 42 -; GCN-NEXT: v_readlane_b32 s3, v1, 43 -; GCN-NEXT: v_readlane_b32 s4, v1, 44 -; GCN-NEXT: v_readlane_b32 s5, v1, 45 -; GCN-NEXT: v_readlane_b32 s6, v1, 46 -; GCN-NEXT: v_readlane_b32 s7, v1, 47 +; GCN-NEXT: v_readlane_b32 s0, v0, 40 +; GCN-NEXT: v_readlane_b32 s1, v0, 41 +; GCN-NEXT: v_readlane_b32 s2, v0, 42 +; GCN-NEXT: v_readlane_b32 s3, v0, 43 +; GCN-NEXT: v_readlane_b32 s4, v0, 44 +; GCN-NEXT: v_readlane_b32 s5, v0, 45 +; GCN-NEXT: v_readlane_b32 s6, v0, 46 +; GCN-NEXT: v_readlane_b32 s7, v0, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s0, v0, 0 -; GCN-NEXT: v_readlane_b32 s1, v0, 1 -; GCN-NEXT: v_readlane_b32 s2, v0, 2 -; GCN-NEXT: v_readlane_b32 s3, v0, 3 -; GCN-NEXT: v_readlane_b32 s4, v0, 4 -; GCN-NEXT: v_readlane_b32 s5, v0, 5 -; GCN-NEXT: v_readlane_b32 s6, v0, 6 -; GCN-NEXT: v_readlane_b32 s7, v0, 7 +; GCN-NEXT: v_readlane_b32 s0, v2, 0 +; GCN-NEXT: v_readlane_b32 s1, v2, 1 +; GCN-NEXT: v_readlane_b32 s2, v2, 2 +; GCN-NEXT: v_readlane_b32 s3, v2, 3 +; GCN-NEXT: v_readlane_b32 s4, v2, 4 +; GCN-NEXT: v_readlane_b32 s5, v2, 5 +; GCN-NEXT: v_readlane_b32 s6, v2, 6 +; GCN-NEXT: v_readlane_b32 s7, v2, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[84:91] ; GCN-NEXT: ;;#ASMEND @@ -420,18 +392,6 @@ ; GCN-NEXT: ; use s[0:7] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB0_2: ; %ret -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[92:95], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[92:95], 0 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v2, off, s[92:95], 0 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: ; kill: killed $vgpr2 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm %wide.sgpr0 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 %wide.sgpr1 = call <8 x i32> asm sideeffect "; def $0", "=s" () #0 @@ -482,125 +442,104 @@ define amdgpu_kernel void @split_sgpr_spill_2_vgprs(ptr addrspace(1) %out, i32 %in) #1 { ; GCN-LABEL: split_sgpr_spill_2_vgprs: ; GCN: ; %bb.0: -; GCN-NEXT: s_mov_b32 s52, SCRATCH_RSRC_DWORD0 -; GCN-NEXT: s_mov_b32 s53, SCRATCH_RSRC_DWORD1 -; GCN-NEXT: s_mov_b32 s54, -1 -; GCN-NEXT: s_mov_b32 s55, 0xe8f000 -; GCN-NEXT: s_add_u32 s52, s52, s11 -; GCN-NEXT: s_addc_u32 s53, s53, 0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr1 ; GCN-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 0 -; GCN-NEXT: v_writelane_b32 v1, s5, 1 -; GCN-NEXT: v_writelane_b32 v1, s6, 2 -; GCN-NEXT: v_writelane_b32 v1, s7, 3 -; GCN-NEXT: v_writelane_b32 v1, s8, 4 -; GCN-NEXT: v_writelane_b32 v1, s9, 5 -; GCN-NEXT: v_writelane_b32 v1, s10, 6 -; GCN-NEXT: v_writelane_b32 v1, s11, 7 -; GCN-NEXT: v_writelane_b32 v1, s12, 8 -; GCN-NEXT: v_writelane_b32 v1, s13, 9 -; GCN-NEXT: v_writelane_b32 v1, s14, 10 -; GCN-NEXT: v_writelane_b32 v1, s15, 11 -; GCN-NEXT: v_writelane_b32 v1, s16, 12 -; GCN-NEXT: v_writelane_b32 v1, s17, 13 -; GCN-NEXT: v_writelane_b32 v1, s18, 14 -; GCN-NEXT: v_writelane_b32 v1, s19, 15 +; GCN-NEXT: v_writelane_b32 v0, s4, 0 +; GCN-NEXT: v_writelane_b32 v0, s5, 1 +; GCN-NEXT: v_writelane_b32 v0, s6, 2 +; GCN-NEXT: v_writelane_b32 v0, s7, 3 +; GCN-NEXT: v_writelane_b32 v0, s8, 4 +; GCN-NEXT: v_writelane_b32 v0, s9, 5 +; GCN-NEXT: v_writelane_b32 v0, s10, 6 +; GCN-NEXT: v_writelane_b32 v0, s11, 7 +; GCN-NEXT: v_writelane_b32 v0, s12, 8 +; GCN-NEXT: v_writelane_b32 v0, s13, 9 +; GCN-NEXT: v_writelane_b32 v0, s14, 10 +; GCN-NEXT: v_writelane_b32 v0, s15, 11 +; GCN-NEXT: v_writelane_b32 v0, s16, 12 +; GCN-NEXT: v_writelane_b32 v0, s17, 13 +; GCN-NEXT: v_writelane_b32 v0, s18, 14 +; GCN-NEXT: v_writelane_b32 v0, s19, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 16 -; GCN-NEXT: v_writelane_b32 v1, s5, 17 -; GCN-NEXT: v_writelane_b32 v1, s6, 18 -; GCN-NEXT: v_writelane_b32 v1, s7, 19 -; GCN-NEXT: v_writelane_b32 v1, s8, 20 -; GCN-NEXT: v_writelane_b32 v1, s9, 21 -; GCN-NEXT: v_writelane_b32 v1, s10, 22 -; GCN-NEXT: v_writelane_b32 v1, s11, 23 -; GCN-NEXT: v_writelane_b32 v1, s12, 24 -; GCN-NEXT: v_writelane_b32 v1, s13, 25 -; GCN-NEXT: v_writelane_b32 v1, s14, 26 -; GCN-NEXT: v_writelane_b32 v1, s15, 27 -; GCN-NEXT: v_writelane_b32 v1, s16, 28 -; GCN-NEXT: v_writelane_b32 v1, s17, 29 -; GCN-NEXT: v_writelane_b32 v1, s18, 30 -; GCN-NEXT: v_writelane_b32 v1, s19, 31 +; GCN-NEXT: v_writelane_b32 v0, s4, 16 +; GCN-NEXT: v_writelane_b32 v0, s5, 17 +; GCN-NEXT: v_writelane_b32 v0, s6, 18 +; GCN-NEXT: v_writelane_b32 v0, s7, 19 +; GCN-NEXT: v_writelane_b32 v0, s8, 20 +; GCN-NEXT: v_writelane_b32 v0, s9, 21 +; GCN-NEXT: v_writelane_b32 v0, s10, 22 +; GCN-NEXT: v_writelane_b32 v0, s11, 23 +; GCN-NEXT: v_writelane_b32 v0, s12, 24 +; GCN-NEXT: v_writelane_b32 v0, s13, 25 +; GCN-NEXT: v_writelane_b32 v0, s14, 26 +; GCN-NEXT: v_writelane_b32 v0, s15, 27 +; GCN-NEXT: v_writelane_b32 v0, s16, 28 +; GCN-NEXT: v_writelane_b32 v0, s17, 29 +; GCN-NEXT: v_writelane_b32 v0, s18, 30 +; GCN-NEXT: v_writelane_b32 v0, s19, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 32 -; GCN-NEXT: v_writelane_b32 v1, s5, 33 -; GCN-NEXT: v_writelane_b32 v1, s6, 34 -; GCN-NEXT: v_writelane_b32 v1, s7, 35 -; GCN-NEXT: v_writelane_b32 v1, s8, 36 -; GCN-NEXT: v_writelane_b32 v1, s9, 37 -; GCN-NEXT: v_writelane_b32 v1, s10, 38 -; GCN-NEXT: v_writelane_b32 v1, s11, 39 -; GCN-NEXT: v_writelane_b32 v1, s12, 40 -; GCN-NEXT: v_writelane_b32 v1, s13, 41 -; GCN-NEXT: v_writelane_b32 v1, s14, 42 -; GCN-NEXT: v_writelane_b32 v1, s15, 43 -; GCN-NEXT: v_writelane_b32 v1, s16, 44 -; GCN-NEXT: v_writelane_b32 v1, s17, 45 -; GCN-NEXT: v_writelane_b32 v1, s18, 46 -; GCN-NEXT: v_writelane_b32 v1, s19, 47 +; GCN-NEXT: v_writelane_b32 v0, s4, 32 +; GCN-NEXT: v_writelane_b32 v0, s5, 33 +; GCN-NEXT: v_writelane_b32 v0, s6, 34 +; GCN-NEXT: v_writelane_b32 v0, s7, 35 +; GCN-NEXT: v_writelane_b32 v0, s8, 36 +; GCN-NEXT: v_writelane_b32 v0, s9, 37 +; GCN-NEXT: v_writelane_b32 v0, s10, 38 +; GCN-NEXT: v_writelane_b32 v0, s11, 39 +; GCN-NEXT: v_writelane_b32 v0, s12, 40 +; GCN-NEXT: v_writelane_b32 v0, s13, 41 +; GCN-NEXT: v_writelane_b32 v0, s14, 42 +; GCN-NEXT: v_writelane_b32 v0, s15, 43 +; GCN-NEXT: v_writelane_b32 v0, s16, 44 +; GCN-NEXT: v_writelane_b32 v0, s17, 45 +; GCN-NEXT: v_writelane_b32 v0, s18, 46 +; GCN-NEXT: v_writelane_b32 v0, s19, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 48 -; GCN-NEXT: v_writelane_b32 v1, s5, 49 -; GCN-NEXT: v_writelane_b32 v1, s6, 50 -; GCN-NEXT: v_writelane_b32 v1, s7, 51 -; GCN-NEXT: v_writelane_b32 v1, s8, 52 -; GCN-NEXT: v_writelane_b32 v1, s9, 53 -; GCN-NEXT: v_writelane_b32 v1, s10, 54 -; GCN-NEXT: v_writelane_b32 v1, s11, 55 -; GCN-NEXT: v_writelane_b32 v1, s12, 56 -; GCN-NEXT: v_writelane_b32 v1, s13, 57 -; GCN-NEXT: v_writelane_b32 v1, s14, 58 -; GCN-NEXT: v_writelane_b32 v1, s15, 59 -; GCN-NEXT: v_writelane_b32 v1, s16, 60 -; GCN-NEXT: v_writelane_b32 v1, s17, 61 -; GCN-NEXT: v_writelane_b32 v1, s18, 62 -; GCN-NEXT: v_writelane_b32 v1, s19, 63 -; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[28:29] +; GCN-NEXT: v_writelane_b32 v0, s4, 48 +; GCN-NEXT: v_writelane_b32 v0, s5, 49 +; GCN-NEXT: v_writelane_b32 v0, s6, 50 +; GCN-NEXT: v_writelane_b32 v0, s7, 51 +; GCN-NEXT: v_writelane_b32 v0, s8, 52 +; GCN-NEXT: v_writelane_b32 v0, s9, 53 +; GCN-NEXT: v_writelane_b32 v0, s10, 54 +; GCN-NEXT: v_writelane_b32 v0, s11, 55 +; GCN-NEXT: v_writelane_b32 v0, s12, 56 +; GCN-NEXT: v_writelane_b32 v0, s13, 57 +; GCN-NEXT: v_writelane_b32 v0, s14, 58 +; GCN-NEXT: v_writelane_b32 v0, s15, 59 +; GCN-NEXT: v_writelane_b32 v0, s16, 60 +; GCN-NEXT: v_writelane_b32 v0, s17, 61 +; GCN-NEXT: v_writelane_b32 v0, s18, 62 +; GCN-NEXT: v_writelane_b32 v0, s19, 63 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:11] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s4, 0 -; GCN-NEXT: v_writelane_b32 v0, s5, 1 -; GCN-NEXT: v_writelane_b32 v0, s6, 2 -; GCN-NEXT: v_writelane_b32 v0, s7, 3 -; GCN-NEXT: v_writelane_b32 v0, s8, 4 -; GCN-NEXT: v_writelane_b32 v0, s9, 5 -; GCN-NEXT: v_writelane_b32 v0, s10, 6 -; GCN-NEXT: v_writelane_b32 v0, s11, 7 +; GCN-NEXT: v_writelane_b32 v1, s4, 0 +; GCN-NEXT: v_writelane_b32 v1, s5, 1 +; GCN-NEXT: v_writelane_b32 v1, s6, 2 +; GCN-NEXT: v_writelane_b32 v1, s7, 3 +; GCN-NEXT: v_writelane_b32 v1, s8, 4 +; GCN-NEXT: v_writelane_b32 v1, s9, 5 +; GCN-NEXT: v_writelane_b32 v1, s10, 6 +; GCN-NEXT: v_writelane_b32 v1, s11, 7 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s2, 8 -; GCN-NEXT: v_writelane_b32 v0, s3, 9 -; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[28:29] +; GCN-NEXT: v_writelane_b32 v1, s2, 8 +; GCN-NEXT: v_writelane_b32 v1, s3, 9 ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 .LBB1_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[28:29] -; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[28:29] -; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s16, v1, 8 ; GCN-NEXT: v_readlane_b32 s17, v1, 9 ; GCN-NEXT: v_readlane_b32 s20, v1, 0 @@ -694,14 +633,6 @@ ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB1_2: ; %ret -; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[28:29] -; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[28:29] -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm %wide.sgpr0 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 %wide.sgpr1 = call <16 x i32> asm sideeffect "; def $0", "=s" () #0 @@ -738,17 +669,9 @@ ; GCN-NEXT: s_mov_b32 s55, 0xe8f000 ; GCN-NEXT: s_add_u32 s52, s52, s11 ; GCN-NEXT: s_addc_u32 s53, s53, 0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: s_load_dword s0, s[4:5], 0xb ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART @@ -762,176 +685,176 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_writelane_b32 v1, s4, 0 -; GCN-NEXT: v_writelane_b32 v1, s5, 1 -; GCN-NEXT: v_writelane_b32 v1, s6, 2 -; GCN-NEXT: v_writelane_b32 v1, s7, 3 -; GCN-NEXT: v_writelane_b32 v1, s8, 4 -; GCN-NEXT: v_writelane_b32 v1, s9, 5 -; GCN-NEXT: v_writelane_b32 v1, s10, 6 -; GCN-NEXT: v_writelane_b32 v1, s11, 7 -; GCN-NEXT: v_writelane_b32 v1, s12, 8 -; GCN-NEXT: v_writelane_b32 v1, s13, 9 -; GCN-NEXT: v_writelane_b32 v1, s14, 10 -; GCN-NEXT: v_writelane_b32 v1, s15, 11 -; GCN-NEXT: v_writelane_b32 v1, s16, 12 -; GCN-NEXT: v_writelane_b32 v1, s17, 13 -; GCN-NEXT: v_writelane_b32 v1, s18, 14 -; GCN-NEXT: v_writelane_b32 v1, s19, 15 +; GCN-NEXT: v_writelane_b32 v31, s4, 0 +; GCN-NEXT: v_writelane_b32 v31, s5, 1 +; GCN-NEXT: v_writelane_b32 v31, s6, 2 +; GCN-NEXT: v_writelane_b32 v31, s7, 3 +; GCN-NEXT: v_writelane_b32 v31, s8, 4 +; GCN-NEXT: v_writelane_b32 v31, s9, 5 +; GCN-NEXT: v_writelane_b32 v31, s10, 6 +; GCN-NEXT: v_writelane_b32 v31, s11, 7 +; GCN-NEXT: v_writelane_b32 v31, s12, 8 +; GCN-NEXT: v_writelane_b32 v31, s13, 9 +; GCN-NEXT: v_writelane_b32 v31, s14, 10 +; GCN-NEXT: v_writelane_b32 v31, s15, 11 +; GCN-NEXT: v_writelane_b32 v31, s16, 12 +; GCN-NEXT: v_writelane_b32 v31, s17, 13 +; GCN-NEXT: v_writelane_b32 v31, s18, 14 +; GCN-NEXT: v_writelane_b32 v31, s19, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 16 -; GCN-NEXT: v_writelane_b32 v1, s5, 17 -; GCN-NEXT: v_writelane_b32 v1, s6, 18 -; GCN-NEXT: v_writelane_b32 v1, s7, 19 -; GCN-NEXT: v_writelane_b32 v1, s8, 20 -; GCN-NEXT: v_writelane_b32 v1, s9, 21 -; GCN-NEXT: v_writelane_b32 v1, s10, 22 -; GCN-NEXT: v_writelane_b32 v1, s11, 23 -; GCN-NEXT: v_writelane_b32 v1, s12, 24 -; GCN-NEXT: v_writelane_b32 v1, s13, 25 -; GCN-NEXT: v_writelane_b32 v1, s14, 26 -; GCN-NEXT: v_writelane_b32 v1, s15, 27 -; GCN-NEXT: v_writelane_b32 v1, s16, 28 -; GCN-NEXT: v_writelane_b32 v1, s17, 29 -; GCN-NEXT: v_writelane_b32 v1, s18, 30 -; GCN-NEXT: v_writelane_b32 v1, s19, 31 +; GCN-NEXT: v_writelane_b32 v31, s4, 16 +; GCN-NEXT: v_writelane_b32 v31, s5, 17 +; GCN-NEXT: v_writelane_b32 v31, s6, 18 +; GCN-NEXT: v_writelane_b32 v31, s7, 19 +; GCN-NEXT: v_writelane_b32 v31, s8, 20 +; GCN-NEXT: v_writelane_b32 v31, s9, 21 +; GCN-NEXT: v_writelane_b32 v31, s10, 22 +; GCN-NEXT: v_writelane_b32 v31, s11, 23 +; GCN-NEXT: v_writelane_b32 v31, s12, 24 +; GCN-NEXT: v_writelane_b32 v31, s13, 25 +; GCN-NEXT: v_writelane_b32 v31, s14, 26 +; GCN-NEXT: v_writelane_b32 v31, s15, 27 +; GCN-NEXT: v_writelane_b32 v31, s16, 28 +; GCN-NEXT: v_writelane_b32 v31, s17, 29 +; GCN-NEXT: v_writelane_b32 v31, s18, 30 +; GCN-NEXT: v_writelane_b32 v31, s19, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 32 -; GCN-NEXT: v_writelane_b32 v1, s5, 33 -; GCN-NEXT: v_writelane_b32 v1, s6, 34 -; GCN-NEXT: v_writelane_b32 v1, s7, 35 -; GCN-NEXT: v_writelane_b32 v1, s8, 36 -; GCN-NEXT: v_writelane_b32 v1, s9, 37 -; GCN-NEXT: v_writelane_b32 v1, s10, 38 -; GCN-NEXT: v_writelane_b32 v1, s11, 39 -; GCN-NEXT: v_writelane_b32 v1, s12, 40 -; GCN-NEXT: v_writelane_b32 v1, s13, 41 -; GCN-NEXT: v_writelane_b32 v1, s14, 42 -; GCN-NEXT: v_writelane_b32 v1, s15, 43 -; GCN-NEXT: v_writelane_b32 v1, s16, 44 -; GCN-NEXT: v_writelane_b32 v1, s17, 45 -; GCN-NEXT: v_writelane_b32 v1, s18, 46 -; GCN-NEXT: v_writelane_b32 v1, s19, 47 +; GCN-NEXT: v_writelane_b32 v31, s4, 32 +; GCN-NEXT: v_writelane_b32 v31, s5, 33 +; GCN-NEXT: v_writelane_b32 v31, s6, 34 +; GCN-NEXT: v_writelane_b32 v31, s7, 35 +; GCN-NEXT: v_writelane_b32 v31, s8, 36 +; GCN-NEXT: v_writelane_b32 v31, s9, 37 +; GCN-NEXT: v_writelane_b32 v31, s10, 38 +; GCN-NEXT: v_writelane_b32 v31, s11, 39 +; GCN-NEXT: v_writelane_b32 v31, s12, 40 +; GCN-NEXT: v_writelane_b32 v31, s13, 41 +; GCN-NEXT: v_writelane_b32 v31, s14, 42 +; GCN-NEXT: v_writelane_b32 v31, s15, 43 +; GCN-NEXT: v_writelane_b32 v31, s16, 44 +; GCN-NEXT: v_writelane_b32 v31, s17, 45 +; GCN-NEXT: v_writelane_b32 v31, s18, 46 +; GCN-NEXT: v_writelane_b32 v31, s19, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 48 -; GCN-NEXT: v_writelane_b32 v1, s5, 49 -; GCN-NEXT: v_writelane_b32 v1, s6, 50 -; GCN-NEXT: v_writelane_b32 v1, s7, 51 -; GCN-NEXT: v_writelane_b32 v1, s8, 52 -; GCN-NEXT: v_writelane_b32 v1, s9, 53 -; GCN-NEXT: v_writelane_b32 v1, s10, 54 -; GCN-NEXT: v_writelane_b32 v1, s11, 55 -; GCN-NEXT: v_writelane_b32 v1, s12, 56 -; GCN-NEXT: v_writelane_b32 v1, s13, 57 -; GCN-NEXT: v_writelane_b32 v1, s14, 58 -; GCN-NEXT: v_writelane_b32 v1, s15, 59 -; GCN-NEXT: v_writelane_b32 v1, s16, 60 -; GCN-NEXT: v_writelane_b32 v1, s17, 61 -; GCN-NEXT: v_writelane_b32 v1, s18, 62 -; GCN-NEXT: v_writelane_b32 v1, s19, 63 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: v_writelane_b32 v31, s4, 48 +; GCN-NEXT: v_writelane_b32 v31, s5, 49 +; GCN-NEXT: v_writelane_b32 v31, s6, 50 +; GCN-NEXT: v_writelane_b32 v31, s7, 51 +; GCN-NEXT: v_writelane_b32 v31, s8, 52 +; GCN-NEXT: v_writelane_b32 v31, s9, 53 +; GCN-NEXT: v_writelane_b32 v31, s10, 54 +; GCN-NEXT: v_writelane_b32 v31, s11, 55 +; GCN-NEXT: v_writelane_b32 v31, s12, 56 +; GCN-NEXT: v_writelane_b32 v31, s13, 57 +; GCN-NEXT: v_writelane_b32 v31, s14, 58 +; GCN-NEXT: v_writelane_b32 v31, s15, 59 +; GCN-NEXT: v_writelane_b32 v31, s16, 60 +; GCN-NEXT: v_writelane_b32 v31, s17, 61 +; GCN-NEXT: v_writelane_b32 v31, s18, 62 +; GCN-NEXT: v_writelane_b32 v31, s19, 63 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; GCN-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-NEXT: v_writelane_b32 v0, s3, 1 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 .LBB2_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s36, v1, 32 -; GCN-NEXT: v_readlane_b32 s37, v1, 33 -; GCN-NEXT: v_readlane_b32 s38, v1, 34 -; GCN-NEXT: v_readlane_b32 s39, v1, 35 -; GCN-NEXT: v_readlane_b32 s40, v1, 36 -; GCN-NEXT: v_readlane_b32 s41, v1, 37 -; GCN-NEXT: v_readlane_b32 s42, v1, 38 -; GCN-NEXT: v_readlane_b32 s43, v1, 39 -; GCN-NEXT: v_readlane_b32 s44, v1, 40 -; GCN-NEXT: v_readlane_b32 s45, v1, 41 -; GCN-NEXT: v_readlane_b32 s46, v1, 42 -; GCN-NEXT: v_readlane_b32 s47, v1, 43 -; GCN-NEXT: v_readlane_b32 s48, v1, 44 -; GCN-NEXT: v_readlane_b32 s49, v1, 45 -; GCN-NEXT: v_readlane_b32 s50, v1, 46 -; GCN-NEXT: v_readlane_b32 s51, v1, 47 -; GCN-NEXT: v_readlane_b32 s0, v1, 16 -; GCN-NEXT: v_readlane_b32 s1, v1, 17 -; GCN-NEXT: v_readlane_b32 s2, v1, 18 -; GCN-NEXT: v_readlane_b32 s3, v1, 19 -; GCN-NEXT: v_readlane_b32 s4, v1, 20 -; GCN-NEXT: v_readlane_b32 s5, v1, 21 -; GCN-NEXT: v_readlane_b32 s6, v1, 22 -; GCN-NEXT: v_readlane_b32 s7, v1, 23 -; GCN-NEXT: v_readlane_b32 s8, v1, 24 -; GCN-NEXT: v_readlane_b32 s9, v1, 25 -; GCN-NEXT: v_readlane_b32 s10, v1, 26 -; GCN-NEXT: v_readlane_b32 s11, v1, 27 -; GCN-NEXT: v_readlane_b32 s12, v1, 28 -; GCN-NEXT: v_readlane_b32 s13, v1, 29 -; GCN-NEXT: v_readlane_b32 s14, v1, 30 -; GCN-NEXT: v_readlane_b32 s15, v1, 31 -; GCN-NEXT: v_readlane_b32 s16, v1, 0 -; GCN-NEXT: v_readlane_b32 s17, v1, 1 -; GCN-NEXT: v_readlane_b32 s18, v1, 2 -; GCN-NEXT: v_readlane_b32 s19, v1, 3 -; GCN-NEXT: v_readlane_b32 s20, v1, 4 -; GCN-NEXT: v_readlane_b32 s21, v1, 5 -; GCN-NEXT: v_readlane_b32 s22, v1, 6 -; GCN-NEXT: v_readlane_b32 s23, v1, 7 -; GCN-NEXT: v_readlane_b32 s24, v1, 8 -; GCN-NEXT: v_readlane_b32 s25, v1, 9 -; GCN-NEXT: v_readlane_b32 s26, v1, 10 -; GCN-NEXT: v_readlane_b32 s27, v1, 11 -; GCN-NEXT: v_readlane_b32 s28, v1, 12 -; GCN-NEXT: v_readlane_b32 s29, v1, 13 -; GCN-NEXT: v_readlane_b32 s30, v1, 14 -; GCN-NEXT: v_readlane_b32 s31, v1, 15 +; GCN-NEXT: v_readlane_b32 s36, v31, 32 +; GCN-NEXT: v_readlane_b32 s37, v31, 33 +; GCN-NEXT: v_readlane_b32 s38, v31, 34 +; GCN-NEXT: v_readlane_b32 s39, v31, 35 +; GCN-NEXT: v_readlane_b32 s40, v31, 36 +; GCN-NEXT: v_readlane_b32 s41, v31, 37 +; GCN-NEXT: v_readlane_b32 s42, v31, 38 +; GCN-NEXT: v_readlane_b32 s43, v31, 39 +; GCN-NEXT: v_readlane_b32 s44, v31, 40 +; GCN-NEXT: v_readlane_b32 s45, v31, 41 +; GCN-NEXT: v_readlane_b32 s46, v31, 42 +; GCN-NEXT: v_readlane_b32 s47, v31, 43 +; GCN-NEXT: v_readlane_b32 s48, v31, 44 +; GCN-NEXT: v_readlane_b32 s49, v31, 45 +; GCN-NEXT: v_readlane_b32 s50, v31, 46 +; GCN-NEXT: v_readlane_b32 s51, v31, 47 +; GCN-NEXT: v_readlane_b32 s0, v31, 16 +; GCN-NEXT: v_readlane_b32 s1, v31, 17 +; GCN-NEXT: v_readlane_b32 s2, v31, 18 +; GCN-NEXT: v_readlane_b32 s3, v31, 19 +; GCN-NEXT: v_readlane_b32 s4, v31, 20 +; GCN-NEXT: v_readlane_b32 s5, v31, 21 +; GCN-NEXT: v_readlane_b32 s6, v31, 22 +; GCN-NEXT: v_readlane_b32 s7, v31, 23 +; GCN-NEXT: v_readlane_b32 s8, v31, 24 +; GCN-NEXT: v_readlane_b32 s9, v31, 25 +; GCN-NEXT: v_readlane_b32 s10, v31, 26 +; GCN-NEXT: v_readlane_b32 s11, v31, 27 +; GCN-NEXT: v_readlane_b32 s12, v31, 28 +; GCN-NEXT: v_readlane_b32 s13, v31, 29 +; GCN-NEXT: v_readlane_b32 s14, v31, 30 +; GCN-NEXT: v_readlane_b32 s15, v31, 31 +; GCN-NEXT: v_readlane_b32 s16, v31, 0 +; GCN-NEXT: v_readlane_b32 s17, v31, 1 +; GCN-NEXT: v_readlane_b32 s18, v31, 2 +; GCN-NEXT: v_readlane_b32 s19, v31, 3 +; GCN-NEXT: v_readlane_b32 s20, v31, 4 +; GCN-NEXT: v_readlane_b32 s21, v31, 5 +; GCN-NEXT: v_readlane_b32 s22, v31, 6 +; GCN-NEXT: v_readlane_b32 s23, v31, 7 +; GCN-NEXT: v_readlane_b32 s24, v31, 8 +; GCN-NEXT: v_readlane_b32 s25, v31, 9 +; GCN-NEXT: v_readlane_b32 s26, v31, 10 +; GCN-NEXT: v_readlane_b32 s27, v31, 11 +; GCN-NEXT: v_readlane_b32 s28, v31, 12 +; GCN-NEXT: v_readlane_b32 s29, v31, 13 +; GCN-NEXT: v_readlane_b32 s30, v31, 14 +; GCN-NEXT: v_readlane_b32 s31, v31, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[16:31] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v1, 48 -; GCN-NEXT: v_readlane_b32 s5, v1, 49 -; GCN-NEXT: v_readlane_b32 s6, v1, 50 -; GCN-NEXT: v_readlane_b32 s7, v1, 51 -; GCN-NEXT: v_readlane_b32 s8, v1, 52 -; GCN-NEXT: v_readlane_b32 s9, v1, 53 -; GCN-NEXT: v_readlane_b32 s10, v1, 54 -; GCN-NEXT: v_readlane_b32 s11, v1, 55 -; GCN-NEXT: v_readlane_b32 s12, v1, 56 -; GCN-NEXT: v_readlane_b32 s13, v1, 57 -; GCN-NEXT: v_readlane_b32 s14, v1, 58 -; GCN-NEXT: v_readlane_b32 s15, v1, 59 -; GCN-NEXT: v_readlane_b32 s16, v1, 60 -; GCN-NEXT: v_readlane_b32 s17, v1, 61 -; GCN-NEXT: v_readlane_b32 s18, v1, 62 -; GCN-NEXT: v_readlane_b32 s19, v1, 63 +; GCN-NEXT: v_readlane_b32 s4, v31, 48 +; GCN-NEXT: v_readlane_b32 s5, v31, 49 +; GCN-NEXT: v_readlane_b32 s6, v31, 50 +; GCN-NEXT: v_readlane_b32 s7, v31, 51 +; GCN-NEXT: v_readlane_b32 s8, v31, 52 +; GCN-NEXT: v_readlane_b32 s9, v31, 53 +; GCN-NEXT: v_readlane_b32 s10, v31, 54 +; GCN-NEXT: v_readlane_b32 s11, v31, 55 +; GCN-NEXT: v_readlane_b32 s12, v31, 56 +; GCN-NEXT: v_readlane_b32 s13, v31, 57 +; GCN-NEXT: v_readlane_b32 s14, v31, 58 +; GCN-NEXT: v_readlane_b32 s15, v31, 59 +; GCN-NEXT: v_readlane_b32 s16, v31, 60 +; GCN-NEXT: v_readlane_b32 s17, v31, 61 +; GCN-NEXT: v_readlane_b32 s18, v31, 62 +; GCN-NEXT: v_readlane_b32 s19, v31, 63 +; GCN-NEXT: s_mov_b64 s[2:3], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s0, v0, 0 ; GCN-NEXT: v_readlane_b32 s1, v0, 1 +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[36:51] ; GCN-NEXT: ;;#ASMEND @@ -942,14 +865,6 @@ ; GCN-NEXT: ; use s[0:1] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB2_2: ; %ret -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm call void asm sideeffect "", "~{v[0:7]}" () #0 call void asm sideeffect "", "~{v[8:15]}" () #0 @@ -989,17 +904,9 @@ ; GCN-NEXT: s_mov_b32 s55, 0xe8f000 ; GCN-NEXT: s_add_u32 s52, s52, s11 ; GCN-NEXT: s_addc_u32 s53, s53, 0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: s_load_dword s0, s[4:5], 0x9 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART @@ -1013,152 +920,144 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_writelane_b32 v1, s4, 0 -; GCN-NEXT: v_writelane_b32 v1, s5, 1 -; GCN-NEXT: v_writelane_b32 v1, s6, 2 -; GCN-NEXT: v_writelane_b32 v1, s7, 3 -; GCN-NEXT: v_writelane_b32 v1, s8, 4 -; GCN-NEXT: v_writelane_b32 v1, s9, 5 -; GCN-NEXT: v_writelane_b32 v1, s10, 6 -; GCN-NEXT: v_writelane_b32 v1, s11, 7 -; GCN-NEXT: v_writelane_b32 v1, s12, 8 -; GCN-NEXT: v_writelane_b32 v1, s13, 9 -; GCN-NEXT: v_writelane_b32 v1, s14, 10 -; GCN-NEXT: v_writelane_b32 v1, s15, 11 -; GCN-NEXT: v_writelane_b32 v1, s16, 12 -; GCN-NEXT: v_writelane_b32 v1, s17, 13 -; GCN-NEXT: v_writelane_b32 v1, s18, 14 -; GCN-NEXT: v_writelane_b32 v1, s19, 15 +; GCN-NEXT: v_writelane_b32 v31, s4, 0 +; GCN-NEXT: v_writelane_b32 v31, s5, 1 +; GCN-NEXT: v_writelane_b32 v31, s6, 2 +; GCN-NEXT: v_writelane_b32 v31, s7, 3 +; GCN-NEXT: v_writelane_b32 v31, s8, 4 +; GCN-NEXT: v_writelane_b32 v31, s9, 5 +; GCN-NEXT: v_writelane_b32 v31, s10, 6 +; GCN-NEXT: v_writelane_b32 v31, s11, 7 +; GCN-NEXT: v_writelane_b32 v31, s12, 8 +; GCN-NEXT: v_writelane_b32 v31, s13, 9 +; GCN-NEXT: v_writelane_b32 v31, s14, 10 +; GCN-NEXT: v_writelane_b32 v31, s15, 11 +; GCN-NEXT: v_writelane_b32 v31, s16, 12 +; GCN-NEXT: v_writelane_b32 v31, s17, 13 +; GCN-NEXT: v_writelane_b32 v31, s18, 14 +; GCN-NEXT: v_writelane_b32 v31, s19, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 16 -; GCN-NEXT: v_writelane_b32 v1, s5, 17 -; GCN-NEXT: v_writelane_b32 v1, s6, 18 -; GCN-NEXT: v_writelane_b32 v1, s7, 19 -; GCN-NEXT: v_writelane_b32 v1, s8, 20 -; GCN-NEXT: v_writelane_b32 v1, s9, 21 -; GCN-NEXT: v_writelane_b32 v1, s10, 22 -; GCN-NEXT: v_writelane_b32 v1, s11, 23 -; GCN-NEXT: v_writelane_b32 v1, s12, 24 -; GCN-NEXT: v_writelane_b32 v1, s13, 25 -; GCN-NEXT: v_writelane_b32 v1, s14, 26 -; GCN-NEXT: v_writelane_b32 v1, s15, 27 -; GCN-NEXT: v_writelane_b32 v1, s16, 28 -; GCN-NEXT: v_writelane_b32 v1, s17, 29 -; GCN-NEXT: v_writelane_b32 v1, s18, 30 -; GCN-NEXT: v_writelane_b32 v1, s19, 31 +; GCN-NEXT: v_writelane_b32 v31, s4, 16 +; GCN-NEXT: v_writelane_b32 v31, s5, 17 +; GCN-NEXT: v_writelane_b32 v31, s6, 18 +; GCN-NEXT: v_writelane_b32 v31, s7, 19 +; GCN-NEXT: v_writelane_b32 v31, s8, 20 +; GCN-NEXT: v_writelane_b32 v31, s9, 21 +; GCN-NEXT: v_writelane_b32 v31, s10, 22 +; GCN-NEXT: v_writelane_b32 v31, s11, 23 +; GCN-NEXT: v_writelane_b32 v31, s12, 24 +; GCN-NEXT: v_writelane_b32 v31, s13, 25 +; GCN-NEXT: v_writelane_b32 v31, s14, 26 +; GCN-NEXT: v_writelane_b32 v31, s15, 27 +; GCN-NEXT: v_writelane_b32 v31, s16, 28 +; GCN-NEXT: v_writelane_b32 v31, s17, 29 +; GCN-NEXT: v_writelane_b32 v31, s18, 30 +; GCN-NEXT: v_writelane_b32 v31, s19, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 32 -; GCN-NEXT: v_writelane_b32 v1, s5, 33 -; GCN-NEXT: v_writelane_b32 v1, s6, 34 -; GCN-NEXT: v_writelane_b32 v1, s7, 35 -; GCN-NEXT: v_writelane_b32 v1, s8, 36 -; GCN-NEXT: v_writelane_b32 v1, s9, 37 -; GCN-NEXT: v_writelane_b32 v1, s10, 38 -; GCN-NEXT: v_writelane_b32 v1, s11, 39 -; GCN-NEXT: v_writelane_b32 v1, s12, 40 -; GCN-NEXT: v_writelane_b32 v1, s13, 41 -; GCN-NEXT: v_writelane_b32 v1, s14, 42 -; GCN-NEXT: v_writelane_b32 v1, s15, 43 -; GCN-NEXT: v_writelane_b32 v1, s16, 44 -; GCN-NEXT: v_writelane_b32 v1, s17, 45 -; GCN-NEXT: v_writelane_b32 v1, s18, 46 -; GCN-NEXT: v_writelane_b32 v1, s19, 47 +; GCN-NEXT: v_writelane_b32 v31, s4, 32 +; GCN-NEXT: v_writelane_b32 v31, s5, 33 +; GCN-NEXT: v_writelane_b32 v31, s6, 34 +; GCN-NEXT: v_writelane_b32 v31, s7, 35 +; GCN-NEXT: v_writelane_b32 v31, s8, 36 +; GCN-NEXT: v_writelane_b32 v31, s9, 37 +; GCN-NEXT: v_writelane_b32 v31, s10, 38 +; GCN-NEXT: v_writelane_b32 v31, s11, 39 +; GCN-NEXT: v_writelane_b32 v31, s12, 40 +; GCN-NEXT: v_writelane_b32 v31, s13, 41 +; GCN-NEXT: v_writelane_b32 v31, s14, 42 +; GCN-NEXT: v_writelane_b32 v31, s15, 43 +; GCN-NEXT: v_writelane_b32 v31, s16, 44 +; GCN-NEXT: v_writelane_b32 v31, s17, 45 +; GCN-NEXT: v_writelane_b32 v31, s18, 46 +; GCN-NEXT: v_writelane_b32 v31, s19, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s4, 48 -; GCN-NEXT: v_writelane_b32 v1, s5, 49 -; GCN-NEXT: v_writelane_b32 v1, s6, 50 -; GCN-NEXT: v_writelane_b32 v1, s7, 51 -; GCN-NEXT: v_writelane_b32 v1, s8, 52 -; GCN-NEXT: v_writelane_b32 v1, s9, 53 -; GCN-NEXT: v_writelane_b32 v1, s10, 54 -; GCN-NEXT: v_writelane_b32 v1, s11, 55 -; GCN-NEXT: v_writelane_b32 v1, s12, 56 -; GCN-NEXT: v_writelane_b32 v1, s13, 57 -; GCN-NEXT: v_writelane_b32 v1, s14, 58 -; GCN-NEXT: v_writelane_b32 v1, s15, 59 -; GCN-NEXT: v_writelane_b32 v1, s16, 60 -; GCN-NEXT: v_writelane_b32 v1, s17, 61 -; GCN-NEXT: v_writelane_b32 v1, s18, 62 -; GCN-NEXT: v_writelane_b32 v1, s19, 63 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: v_writelane_b32 v31, s4, 48 +; GCN-NEXT: v_writelane_b32 v31, s5, 49 +; GCN-NEXT: v_writelane_b32 v31, s6, 50 +; GCN-NEXT: v_writelane_b32 v31, s7, 51 +; GCN-NEXT: v_writelane_b32 v31, s8, 52 +; GCN-NEXT: v_writelane_b32 v31, s9, 53 +; GCN-NEXT: v_writelane_b32 v31, s10, 54 +; GCN-NEXT: v_writelane_b32 v31, s11, 55 +; GCN-NEXT: v_writelane_b32 v31, s12, 56 +; GCN-NEXT: v_writelane_b32 v31, s13, 57 +; GCN-NEXT: v_writelane_b32 v31, s14, 58 +; GCN-NEXT: v_writelane_b32 v31, s15, 59 +; GCN-NEXT: v_writelane_b32 v31, s16, 60 +; GCN-NEXT: v_writelane_b32 v31, s17, 61 +; GCN-NEXT: v_writelane_b32 v31, s18, 62 +; GCN-NEXT: v_writelane_b32 v31, s19, 63 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[2:3] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: s_mov_b64 s[4:5], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 ; GCN-NEXT: v_writelane_b32 v0, s2, 0 ; GCN-NEXT: v_writelane_b32 v0, s3, 1 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GCN-NEXT: buffer_store_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[34:35] +; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_mov_b32 s1, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s0, s1 ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v2, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s36, v2, 32 -; GCN-NEXT: v_readlane_b32 s37, v2, 33 -; GCN-NEXT: v_readlane_b32 s38, v2, 34 -; GCN-NEXT: v_readlane_b32 s39, v2, 35 -; GCN-NEXT: v_readlane_b32 s40, v2, 36 -; GCN-NEXT: v_readlane_b32 s41, v2, 37 -; GCN-NEXT: v_readlane_b32 s42, v2, 38 -; GCN-NEXT: v_readlane_b32 s43, v2, 39 -; GCN-NEXT: v_readlane_b32 s44, v2, 40 -; GCN-NEXT: v_readlane_b32 s45, v2, 41 -; GCN-NEXT: v_readlane_b32 s46, v2, 42 -; GCN-NEXT: v_readlane_b32 s47, v2, 43 -; GCN-NEXT: v_readlane_b32 s48, v2, 44 -; GCN-NEXT: v_readlane_b32 s49, v2, 45 -; GCN-NEXT: v_readlane_b32 s50, v2, 46 -; GCN-NEXT: v_readlane_b32 s51, v2, 47 -; GCN-NEXT: v_readlane_b32 s0, v2, 16 -; GCN-NEXT: v_readlane_b32 s1, v2, 17 -; GCN-NEXT: v_readlane_b32 s2, v2, 18 -; GCN-NEXT: v_readlane_b32 s3, v2, 19 -; GCN-NEXT: v_readlane_b32 s4, v2, 20 -; GCN-NEXT: v_readlane_b32 s5, v2, 21 -; GCN-NEXT: v_readlane_b32 s6, v2, 22 -; GCN-NEXT: v_readlane_b32 s7, v2, 23 -; GCN-NEXT: v_readlane_b32 s8, v2, 24 -; GCN-NEXT: v_readlane_b32 s9, v2, 25 -; GCN-NEXT: v_readlane_b32 s10, v2, 26 -; GCN-NEXT: v_readlane_b32 s11, v2, 27 -; GCN-NEXT: v_readlane_b32 s12, v2, 28 -; GCN-NEXT: v_readlane_b32 s13, v2, 29 -; GCN-NEXT: v_readlane_b32 s14, v2, 30 -; GCN-NEXT: v_readlane_b32 s15, v2, 31 -; GCN-NEXT: v_readlane_b32 s16, v2, 0 -; GCN-NEXT: v_readlane_b32 s17, v2, 1 -; GCN-NEXT: v_readlane_b32 s18, v2, 2 -; GCN-NEXT: v_readlane_b32 s19, v2, 3 -; GCN-NEXT: v_readlane_b32 s20, v2, 4 -; GCN-NEXT: v_readlane_b32 s21, v2, 5 -; GCN-NEXT: v_readlane_b32 s22, v2, 6 -; GCN-NEXT: v_readlane_b32 s23, v2, 7 -; GCN-NEXT: v_readlane_b32 s24, v2, 8 -; GCN-NEXT: v_readlane_b32 s25, v2, 9 -; GCN-NEXT: v_readlane_b32 s26, v2, 10 -; GCN-NEXT: v_readlane_b32 s27, v2, 11 -; GCN-NEXT: v_readlane_b32 s28, v2, 12 -; GCN-NEXT: v_readlane_b32 s29, v2, 13 -; GCN-NEXT: v_readlane_b32 s30, v2, 14 -; GCN-NEXT: v_readlane_b32 s31, v2, 15 +; GCN-NEXT: v_readlane_b32 s36, v31, 32 +; GCN-NEXT: v_readlane_b32 s37, v31, 33 +; GCN-NEXT: v_readlane_b32 s38, v31, 34 +; GCN-NEXT: v_readlane_b32 s39, v31, 35 +; GCN-NEXT: v_readlane_b32 s40, v31, 36 +; GCN-NEXT: v_readlane_b32 s41, v31, 37 +; GCN-NEXT: v_readlane_b32 s42, v31, 38 +; GCN-NEXT: v_readlane_b32 s43, v31, 39 +; GCN-NEXT: v_readlane_b32 s44, v31, 40 +; GCN-NEXT: v_readlane_b32 s45, v31, 41 +; GCN-NEXT: v_readlane_b32 s46, v31, 42 +; GCN-NEXT: v_readlane_b32 s47, v31, 43 +; GCN-NEXT: v_readlane_b32 s48, v31, 44 +; GCN-NEXT: v_readlane_b32 s49, v31, 45 +; GCN-NEXT: v_readlane_b32 s50, v31, 46 +; GCN-NEXT: v_readlane_b32 s51, v31, 47 +; GCN-NEXT: v_readlane_b32 s0, v31, 16 +; GCN-NEXT: v_readlane_b32 s1, v31, 17 +; GCN-NEXT: v_readlane_b32 s2, v31, 18 +; GCN-NEXT: v_readlane_b32 s3, v31, 19 +; GCN-NEXT: v_readlane_b32 s4, v31, 20 +; GCN-NEXT: v_readlane_b32 s5, v31, 21 +; GCN-NEXT: v_readlane_b32 s6, v31, 22 +; GCN-NEXT: v_readlane_b32 s7, v31, 23 +; GCN-NEXT: v_readlane_b32 s8, v31, 24 +; GCN-NEXT: v_readlane_b32 s9, v31, 25 +; GCN-NEXT: v_readlane_b32 s10, v31, 26 +; GCN-NEXT: v_readlane_b32 s11, v31, 27 +; GCN-NEXT: v_readlane_b32 s12, v31, 28 +; GCN-NEXT: v_readlane_b32 s13, v31, 29 +; GCN-NEXT: v_readlane_b32 s14, v31, 30 +; GCN-NEXT: v_readlane_b32 s15, v31, 31 +; GCN-NEXT: v_readlane_b32 s16, v31, 0 +; GCN-NEXT: v_readlane_b32 s17, v31, 1 +; GCN-NEXT: v_readlane_b32 s18, v31, 2 +; GCN-NEXT: v_readlane_b32 s19, v31, 3 +; GCN-NEXT: v_readlane_b32 s20, v31, 4 +; GCN-NEXT: v_readlane_b32 s21, v31, 5 +; GCN-NEXT: v_readlane_b32 s22, v31, 6 +; GCN-NEXT: v_readlane_b32 s23, v31, 7 +; GCN-NEXT: v_readlane_b32 s24, v31, 8 +; GCN-NEXT: v_readlane_b32 s25, v31, 9 +; GCN-NEXT: v_readlane_b32 s26, v31, 10 +; GCN-NEXT: v_readlane_b32 s27, v31, 11 +; GCN-NEXT: v_readlane_b32 s28, v31, 12 +; GCN-NEXT: v_readlane_b32 s29, v31, 13 +; GCN-NEXT: v_readlane_b32 s30, v31, 14 +; GCN-NEXT: v_readlane_b32 s31, v31, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def v0 ; GCN-NEXT: ;;#ASMEND @@ -1168,24 +1067,32 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[0:15] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v2, 48 -; GCN-NEXT: v_readlane_b32 s5, v2, 49 -; GCN-NEXT: v_readlane_b32 s6, v2, 50 -; GCN-NEXT: v_readlane_b32 s7, v2, 51 -; GCN-NEXT: v_readlane_b32 s8, v2, 52 -; GCN-NEXT: v_readlane_b32 s9, v2, 53 -; GCN-NEXT: v_readlane_b32 s10, v2, 54 -; GCN-NEXT: v_readlane_b32 s11, v2, 55 -; GCN-NEXT: v_readlane_b32 s12, v2, 56 -; GCN-NEXT: v_readlane_b32 s13, v2, 57 -; GCN-NEXT: v_readlane_b32 s14, v2, 58 -; GCN-NEXT: v_readlane_b32 s15, v2, 59 -; GCN-NEXT: v_readlane_b32 s16, v2, 60 -; GCN-NEXT: v_readlane_b32 s17, v2, 61 -; GCN-NEXT: v_readlane_b32 s18, v2, 62 -; GCN-NEXT: v_readlane_b32 s19, v2, 63 +; GCN-NEXT: v_readlane_b32 s4, v31, 48 +; GCN-NEXT: v_readlane_b32 s5, v31, 49 +; GCN-NEXT: v_readlane_b32 s6, v31, 50 +; GCN-NEXT: v_readlane_b32 s7, v31, 51 +; GCN-NEXT: v_readlane_b32 s8, v31, 52 +; GCN-NEXT: v_readlane_b32 s9, v31, 53 +; GCN-NEXT: v_readlane_b32 s10, v31, 54 +; GCN-NEXT: v_readlane_b32 s11, v31, 55 +; GCN-NEXT: v_readlane_b32 s12, v31, 56 +; GCN-NEXT: v_readlane_b32 s13, v31, 57 +; GCN-NEXT: v_readlane_b32 s14, v31, 58 +; GCN-NEXT: v_readlane_b32 s15, v31, 59 +; GCN-NEXT: v_readlane_b32 s16, v31, 60 +; GCN-NEXT: v_readlane_b32 s17, v31, 61 +; GCN-NEXT: v_readlane_b32 s18, v31, 62 +; GCN-NEXT: v_readlane_b32 s19, v31, 63 +; GCN-NEXT: s_mov_b64 s[2:3], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_store_dword v1, off, s[52:55], 0 +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s0, v1, 0 ; GCN-NEXT: v_readlane_b32 s1, v1, 1 +; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 exec, s[2:3] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[36:51] ; GCN-NEXT: ;;#ASMEND @@ -1199,14 +1106,6 @@ ; GCN-NEXT: ; use v0 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB3_2: ; %ret -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[52:55], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[52:55], 0 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm call void asm sideeffect "", "~{v[0:7]}" () #0 call void asm sideeffect "", "~{v[8:15]}" () #0 diff --git a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll b/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/preserve-wwm-copy-dst-reg.ll +++ /dev/null @@ -1,816 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX906 %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -verify-machineinstrs < %s | FileCheck -check-prefix=GFX908 %s - -; Due to high register pressure, regalloc would split the liverange of wwm VGPR register used for SGPR spills -; and introduce a copy. The copy should be of whole-wave with exec mask manipulation around it. -; FIXME: The destination register involved in the whole-wave copy should be considered for preserving all the lanes -; with a spill/restore at function prolog/epilog. The copy might otherwise clobber its inactive lanes unwantedly. -define void @preserve_wwm_copy_dstreg(ptr %parg0, ptr %parg1, ptr %parg2) #0 { -; GFX906-LABEL: preserve_wwm_copy_dstreg: -; GFX906: ; %bb.0: -; GFX906-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_mov_b32 s16, s33 -; GFX906-NEXT: s_mov_b32 s33, s32 -; GFX906-NEXT: s_xor_saveexec_b64 s[18:19], -1 -; GFX906-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill -; GFX906-NEXT: s_mov_b64 exec, -1 -; GFX906-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill -; GFX906-NEXT: s_mov_b64 exec, s[18:19] -; GFX906-NEXT: s_mov_b32 s21, s15 -; GFX906-NEXT: ; implicit-def: $vgpr2 -; GFX906-NEXT: s_mov_b32 s22, s14 -; GFX906-NEXT: v_writelane_b32 v2, s21, 0 -; GFX906-NEXT: v_writelane_b32 v2, s22, 1 -; GFX906-NEXT: s_mov_b32 s23, s13 -; GFX906-NEXT: v_writelane_b32 v2, s23, 2 -; GFX906-NEXT: s_mov_b32 s24, s12 -; GFX906-NEXT: v_writelane_b32 v2, s24, 3 -; GFX906-NEXT: s_mov_b64 s[26:27], s[10:11] -; GFX906-NEXT: v_writelane_b32 v2, s26, 4 -; GFX906-NEXT: v_writelane_b32 v2, s27, 5 -; GFX906-NEXT: v_writelane_b32 v2, s8, 6 -; GFX906-NEXT: v_writelane_b32 v2, s9, 7 -; GFX906-NEXT: v_writelane_b32 v2, s6, 8 -; GFX906-NEXT: v_writelane_b32 v41, s16, 2 -; GFX906-NEXT: v_writelane_b32 v2, s7, 9 -; GFX906-NEXT: v_writelane_b32 v41, s30, 0 -; GFX906-NEXT: v_writelane_b32 v2, s4, 10 -; GFX906-NEXT: s_addk_i32 s32, 0x2800 -; GFX906-NEXT: v_writelane_b32 v41, s31, 1 -; GFX906-NEXT: v_mov_b32_e32 v32, v31 -; GFX906-NEXT: v_writelane_b32 v2, s5, 11 -; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX906-NEXT: v_mov_b32_e32 v33, v2 -; GFX906-NEXT: s_mov_b64 exec, s[34:35] -; GFX906-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def v[0:31] -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill -; GFX906-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def v40 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def s11 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX906-NEXT: v_mov_b32_e32 v40, v33 -; GFX906-NEXT: s_mov_b64 exec, s[34:35] -; GFX906-NEXT: v_writelane_b32 v40, s11, 12 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def s12 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_writelane_b32 v40, s12, 13 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def s13 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_writelane_b32 v40, s13, 14 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def s14 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_writelane_b32 v40, s14, 15 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def s15 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_writelane_b32 v40, s15, 16 -; GFX906-NEXT: s_getpc_b64 s[10:11] -; GFX906-NEXT: s_add_u32 s10, s10, foo@gotpcrel32@lo+4 -; GFX906-NEXT: s_addc_u32 s11, s11, foo@gotpcrel32@hi+12 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def s16 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_writelane_b32 v40, s16, 17 -; GFX906-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def s17 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_writelane_b32 v40, s17, 18 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def s18 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_writelane_b32 v40, s18, 19 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def s19 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_writelane_b32 v40, s19, 20 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def s20 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_writelane_b32 v40, s20, 21 -; GFX906-NEXT: s_waitcnt lgkmcnt(0) -; GFX906-NEXT: v_writelane_b32 v40, s10, 22 -; GFX906-NEXT: v_writelane_b32 v40, s11, 23 -; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX906-NEXT: s_mov_b64 exec, s[34:35] -; GFX906-NEXT: v_readlane_b32 s16, v40, 22 -; GFX906-NEXT: s_mov_b32 s12, s24 -; GFX906-NEXT: s_mov_b32 s13, s23 -; GFX906-NEXT: s_mov_b32 s14, s22 -; GFX906-NEXT: v_mov_b32_e32 v31, v32 -; GFX906-NEXT: s_mov_b32 s15, s21 -; GFX906-NEXT: s_mov_b64 s[10:11], s[26:27] -; GFX906-NEXT: v_readlane_b32 s17, v40, 23 -; GFX906-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX906-NEXT: s_mov_b64 exec, s[34:35] -; GFX906-NEXT: v_readlane_b32 s11, v40, 12 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; use s11 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s12, v40, 13 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; use s12 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s13, v40, 14 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; use s13 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s14, v40, 15 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; use s14 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s15, v40, 16 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; use s15 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s16, v40, 17 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; use s16 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s17, v40, 18 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; use s17 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s18, v40, 19 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; use s18 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s19, v40, 20 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; use s19 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s20, v40, 21 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; use s20 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def s21 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def s22 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def s23 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def s24 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def s25 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def s26 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def s27 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def s28 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; def s29 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX906-NEXT: v_writelane_b32 v40, s21, 24 -; GFX906-NEXT: v_writelane_b32 v40, s22, 25 -; GFX906-NEXT: v_writelane_b32 v40, s23, 26 -; GFX906-NEXT: v_writelane_b32 v40, s24, 27 -; GFX906-NEXT: v_writelane_b32 v40, s25, 28 -; GFX906-NEXT: v_writelane_b32 v40, s26, 29 -; GFX906-NEXT: v_writelane_b32 v40, s27, 30 -; GFX906-NEXT: v_writelane_b32 v40, s28, 31 -; GFX906-NEXT: v_writelane_b32 v40, s29, 32 -; GFX906-NEXT: v_readlane_b32 s4, v40, 10 -; GFX906-NEXT: v_readlane_b32 s6, v40, 8 -; GFX906-NEXT: v_readlane_b32 s8, v40, 6 -; GFX906-NEXT: v_readlane_b32 s10, v40, 4 -; GFX906-NEXT: v_readlane_b32 s16, v40, 22 -; GFX906-NEXT: v_readlane_b32 s12, v40, 3 -; GFX906-NEXT: v_readlane_b32 s13, v40, 2 -; GFX906-NEXT: v_readlane_b32 s14, v40, 1 -; GFX906-NEXT: v_readlane_b32 s15, v40, 0 -; GFX906-NEXT: v_readlane_b32 s5, v40, 11 -; GFX906-NEXT: v_readlane_b32 s7, v40, 9 -; GFX906-NEXT: v_readlane_b32 s9, v40, 7 -; GFX906-NEXT: v_readlane_b32 s11, v40, 5 -; GFX906-NEXT: v_readlane_b32 s17, v40, 23 -; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX906-NEXT: s_mov_b64 exec, s[34:35] -; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX906-NEXT: s_mov_b64 exec, s[34:35] -; GFX906-NEXT: v_readlane_b32 s21, v40, 24 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; use s21 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s22, v40, 25 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; use s22 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s23, v40, 26 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; use s23 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s24, v40, 27 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; use s24 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s25, v40, 28 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; use s25 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s26, v40, 29 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; use s26 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s27, v40, 30 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; use s27 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s28, v40, 31 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; use s28 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: v_readlane_b32 s29, v40, 32 -; GFX906-NEXT: ;;#ASMSTART -; GFX906-NEXT: ; use s29 -; GFX906-NEXT: ;;#ASMEND -; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX906-NEXT: v_readlane_b32 s4, v40, 10 -; GFX906-NEXT: v_readlane_b32 s6, v40, 8 -; GFX906-NEXT: v_readlane_b32 s8, v40, 6 -; GFX906-NEXT: v_readlane_b32 s10, v40, 4 -; GFX906-NEXT: v_readlane_b32 s16, v40, 22 -; GFX906-NEXT: v_readlane_b32 s5, v40, 11 -; GFX906-NEXT: v_readlane_b32 s7, v40, 9 -; GFX906-NEXT: v_readlane_b32 s9, v40, 7 -; GFX906-NEXT: v_readlane_b32 s11, v40, 5 -; GFX906-NEXT: v_readlane_b32 s12, v40, 3 -; GFX906-NEXT: v_readlane_b32 s13, v40, 2 -; GFX906-NEXT: v_readlane_b32 s14, v40, 1 -; GFX906-NEXT: v_readlane_b32 s15, v40, 0 -; GFX906-NEXT: v_readlane_b32 s17, v40, 23 -; GFX906-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX906-NEXT: s_mov_b64 exec, s[34:35] -; GFX906-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX906-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload -; GFX906-NEXT: v_readlane_b32 s31, v41, 1 -; GFX906-NEXT: v_readlane_b32 s30, v41, 0 -; GFX906-NEXT: ; kill: killed $vgpr40 -; GFX906-NEXT: v_readlane_b32 s4, v41, 2 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:96 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:80 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[18:21] offset:64 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[14:17] offset:48 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[10:13] offset:32 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[6:9] offset:16 -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GFX906-NEXT: s_waitcnt vmcnt(0) -; GFX906-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GFX906-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload -; GFX906-NEXT: s_mov_b64 exec, -1 -; GFX906-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX906-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload -; GFX906-NEXT: s_mov_b64 exec, s[6:7] -; GFX906-NEXT: s_addk_i32 s32, 0xd800 -; GFX906-NEXT: s_mov_b32 s33, s4 -; GFX906-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX906-NEXT: s_setpc_b64 s[30:31] -; -; GFX908-LABEL: preserve_wwm_copy_dstreg: -; GFX908: ; %bb.0: -; GFX908-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_mov_b32 s16, s33 -; GFX908-NEXT: s_mov_b32 s33, s32 -; GFX908-NEXT: s_xor_saveexec_b64 s[18:19], -1 -; GFX908-NEXT: buffer_store_dword v33, off, s[0:3], s33 offset:148 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:156 ; 4-byte Folded Spill -; GFX908-NEXT: s_mov_b64 exec, -1 -; GFX908-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:152 ; 4-byte Folded Spill -; GFX908-NEXT: s_mov_b64 exec, s[18:19] -; GFX908-NEXT: v_mov_b32_e32 v3, s16 -; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:160 ; 4-byte Folded Spill -; GFX908-NEXT: s_addk_i32 s32, 0x2c00 -; GFX908-NEXT: s_mov_b64 s[16:17], exec -; GFX908-NEXT: s_mov_b64 exec, 1 -; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:164 -; GFX908-NEXT: v_writelane_b32 v2, s30, 0 -; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:164 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_mov_b64 exec, s[16:17] -; GFX908-NEXT: s_mov_b64 s[16:17], exec -; GFX908-NEXT: s_mov_b64 exec, 1 -; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:164 -; GFX908-NEXT: v_writelane_b32 v2, s31, 0 -; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:164 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_mov_b64 exec, s[16:17] -; GFX908-NEXT: s_mov_b32 s21, s15 -; GFX908-NEXT: ; implicit-def: $vgpr2 -; GFX908-NEXT: s_mov_b32 s22, s14 -; GFX908-NEXT: v_writelane_b32 v2, s21, 0 -; GFX908-NEXT: v_writelane_b32 v2, s22, 1 -; GFX908-NEXT: s_mov_b32 s23, s13 -; GFX908-NEXT: v_writelane_b32 v2, s23, 2 -; GFX908-NEXT: s_mov_b32 s24, s12 -; GFX908-NEXT: v_writelane_b32 v2, s24, 3 -; GFX908-NEXT: s_mov_b64 s[26:27], s[10:11] -; GFX908-NEXT: v_writelane_b32 v2, s26, 4 -; GFX908-NEXT: v_writelane_b32 v2, s27, 5 -; GFX908-NEXT: v_writelane_b32 v2, s8, 6 -; GFX908-NEXT: v_writelane_b32 v2, s9, 7 -; GFX908-NEXT: v_writelane_b32 v2, s6, 8 -; GFX908-NEXT: v_writelane_b32 v2, s7, 9 -; GFX908-NEXT: v_writelane_b32 v2, s4, 10 -; GFX908-NEXT: v_mov_b32_e32 v32, v31 -; GFX908-NEXT: v_writelane_b32 v2, s5, 11 -; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX908-NEXT: v_mov_b32_e32 v33, v2 -; GFX908-NEXT: s_mov_b64 exec, s[34:35] -; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def v[0:31] -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v7, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:56 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v10, off, s[0:3], s33 offset:60 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v11, off, s[0:3], s33 offset:64 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v12, off, s[0:3], s33 offset:68 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v13, off, s[0:3], s33 offset:72 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v14, off, s[0:3], s33 offset:76 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v15, off, s[0:3], s33 offset:80 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v16, off, s[0:3], s33 offset:84 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v17, off, s[0:3], s33 offset:88 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v18, off, s[0:3], s33 offset:92 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v19, off, s[0:3], s33 offset:96 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v20, off, s[0:3], s33 offset:100 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v21, off, s[0:3], s33 offset:104 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v22, off, s[0:3], s33 offset:108 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v23, off, s[0:3], s33 offset:112 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v24, off, s[0:3], s33 offset:116 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v25, off, s[0:3], s33 offset:120 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v26, off, s[0:3], s33 offset:124 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v27, off, s[0:3], s33 offset:128 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v28, off, s[0:3], s33 offset:132 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v29, off, s[0:3], s33 offset:136 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v30, off, s[0:3], s33 offset:140 ; 4-byte Folded Spill -; GFX908-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:144 ; 4-byte Folded Spill -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def v40 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def s11 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX908-NEXT: v_mov_b32_e32 v40, v33 -; GFX908-NEXT: s_mov_b64 exec, s[34:35] -; GFX908-NEXT: v_writelane_b32 v40, s11, 12 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def s12 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_writelane_b32 v40, s12, 13 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def s13 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_writelane_b32 v40, s13, 14 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def s14 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_writelane_b32 v40, s14, 15 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def s15 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_writelane_b32 v40, s15, 16 -; GFX908-NEXT: s_getpc_b64 s[10:11] -; GFX908-NEXT: s_add_u32 s10, s10, foo@gotpcrel32@lo+4 -; GFX908-NEXT: s_addc_u32 s11, s11, foo@gotpcrel32@hi+12 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def s16 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_writelane_b32 v40, s16, 17 -; GFX908-NEXT: s_load_dwordx2 s[10:11], s[10:11], 0x0 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def s17 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_writelane_b32 v40, s17, 18 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def s18 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_writelane_b32 v40, s18, 19 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def s19 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_writelane_b32 v40, s19, 20 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def s20 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_writelane_b32 v40, s20, 21 -; GFX908-NEXT: s_waitcnt lgkmcnt(0) -; GFX908-NEXT: v_writelane_b32 v40, s10, 22 -; GFX908-NEXT: v_writelane_b32 v40, s11, 23 -; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX908-NEXT: s_mov_b64 exec, s[34:35] -; GFX908-NEXT: v_readlane_b32 s16, v40, 22 -; GFX908-NEXT: s_mov_b32 s12, s24 -; GFX908-NEXT: s_mov_b32 s13, s23 -; GFX908-NEXT: s_mov_b32 s14, s22 -; GFX908-NEXT: v_mov_b32_e32 v31, v32 -; GFX908-NEXT: s_mov_b32 s15, s21 -; GFX908-NEXT: s_mov_b64 s[10:11], s[26:27] -; GFX908-NEXT: v_readlane_b32 s17, v40, 23 -; GFX908-NEXT: buffer_store_dword v32, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX908-NEXT: s_mov_b64 exec, s[34:35] -; GFX908-NEXT: v_readlane_b32 s11, v40, 12 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; use s11 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s12, v40, 13 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; use s12 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s13, v40, 14 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; use s13 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s14, v40, 15 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; use s14 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s15, v40, 16 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; use s15 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s16, v40, 17 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; use s16 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s17, v40, 18 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; use s17 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s18, v40, 19 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; use s18 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s19, v40, 20 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; use s19 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s20, v40, 21 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; use s20 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def s21 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def s22 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def s23 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def s24 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def s25 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def s26 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def s27 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def s28 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; def s29 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX908-NEXT: v_writelane_b32 v40, s21, 24 -; GFX908-NEXT: v_writelane_b32 v40, s22, 25 -; GFX908-NEXT: v_writelane_b32 v40, s23, 26 -; GFX908-NEXT: v_writelane_b32 v40, s24, 27 -; GFX908-NEXT: v_writelane_b32 v40, s25, 28 -; GFX908-NEXT: v_writelane_b32 v40, s26, 29 -; GFX908-NEXT: v_writelane_b32 v40, s27, 30 -; GFX908-NEXT: v_writelane_b32 v40, s28, 31 -; GFX908-NEXT: v_writelane_b32 v40, s29, 32 -; GFX908-NEXT: v_readlane_b32 s4, v40, 10 -; GFX908-NEXT: v_readlane_b32 s6, v40, 8 -; GFX908-NEXT: v_readlane_b32 s8, v40, 6 -; GFX908-NEXT: v_readlane_b32 s10, v40, 4 -; GFX908-NEXT: v_readlane_b32 s16, v40, 22 -; GFX908-NEXT: v_readlane_b32 s12, v40, 3 -; GFX908-NEXT: v_readlane_b32 s13, v40, 2 -; GFX908-NEXT: v_readlane_b32 s14, v40, 1 -; GFX908-NEXT: v_readlane_b32 s15, v40, 0 -; GFX908-NEXT: v_readlane_b32 s5, v40, 11 -; GFX908-NEXT: v_readlane_b32 s7, v40, 9 -; GFX908-NEXT: v_readlane_b32 s9, v40, 7 -; GFX908-NEXT: v_readlane_b32 s11, v40, 5 -; GFX908-NEXT: v_readlane_b32 s17, v40, 23 -; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX908-NEXT: s_mov_b64 exec, s[34:35] -; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX908-NEXT: s_mov_b64 exec, s[34:35] -; GFX908-NEXT: v_readlane_b32 s21, v40, 24 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; use s21 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s22, v40, 25 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; use s22 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s23, v40, 26 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; use s23 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s24, v40, 27 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; use s24 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s25, v40, 28 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; use s25 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s26, v40, 29 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; use s26 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s27, v40, 30 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; use s27 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s28, v40, 31 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; use s28 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: v_readlane_b32 s29, v40, 32 -; GFX908-NEXT: ;;#ASMSTART -; GFX908-NEXT: ; use s29 -; GFX908-NEXT: ;;#ASMEND -; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX908-NEXT: v_readlane_b32 s4, v40, 10 -; GFX908-NEXT: v_readlane_b32 s6, v40, 8 -; GFX908-NEXT: v_readlane_b32 s8, v40, 6 -; GFX908-NEXT: v_readlane_b32 s10, v40, 4 -; GFX908-NEXT: v_readlane_b32 s16, v40, 22 -; GFX908-NEXT: v_readlane_b32 s5, v40, 11 -; GFX908-NEXT: v_readlane_b32 s7, v40, 9 -; GFX908-NEXT: v_readlane_b32 s9, v40, 7 -; GFX908-NEXT: v_readlane_b32 s11, v40, 5 -; GFX908-NEXT: v_readlane_b32 s12, v40, 3 -; GFX908-NEXT: v_readlane_b32 s13, v40, 2 -; GFX908-NEXT: v_readlane_b32 s14, v40, 1 -; GFX908-NEXT: v_readlane_b32 s15, v40, 0 -; GFX908-NEXT: v_readlane_b32 s17, v40, 23 -; GFX908-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX908-NEXT: s_mov_b64 exec, s[34:35] -; GFX908-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v7, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v10, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v11, off, s[0:3], s33 offset:56 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v12, off, s[0:3], s33 offset:60 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v13, off, s[0:3], s33 offset:64 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v14, off, s[0:3], s33 offset:68 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v15, off, s[0:3], s33 offset:72 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v16, off, s[0:3], s33 offset:76 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v17, off, s[0:3], s33 offset:80 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v18, off, s[0:3], s33 offset:84 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v19, off, s[0:3], s33 offset:88 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v20, off, s[0:3], s33 offset:92 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v21, off, s[0:3], s33 offset:96 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v22, off, s[0:3], s33 offset:100 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v23, off, s[0:3], s33 offset:104 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v24, off, s[0:3], s33 offset:108 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v25, off, s[0:3], s33 offset:112 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v26, off, s[0:3], s33 offset:116 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v27, off, s[0:3], s33 offset:120 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v28, off, s[0:3], s33 offset:124 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v29, off, s[0:3], s33 offset:128 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v30, off, s[0:3], s33 offset:132 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:136 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v32, off, s[0:3], s33 offset:140 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:144 ; 4-byte Folded Reload -; GFX908-NEXT: s_mov_b64 s[4:5], exec -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[30:33] offset:112 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[26:29] offset:96 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[22:25] offset:80 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[18:21] offset:64 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[14:17] offset:48 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[10:13] offset:32 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[6:9] offset:16 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: flat_store_dwordx4 v[0:1], v[2:5] -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_mov_b64 exec, 1 -; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:164 -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readlane_b32 s31, v0, 0 -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_mov_b64 exec, s[4:5] -; GFX908-NEXT: s_mov_b64 s[4:5], exec -; GFX908-NEXT: s_mov_b64 exec, 1 -; GFX908-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:164 -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readlane_b32 s30, v0, 0 -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:164 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: s_mov_b64 exec, s[4:5] -; GFX908-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:160 ; 4-byte Folded Reload -; GFX908-NEXT: ; kill: killed $vgpr40 -; GFX908-NEXT: s_waitcnt vmcnt(0) -; GFX908-NEXT: v_readfirstlane_b32 s4, v0 -; GFX908-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GFX908-NEXT: buffer_load_dword v33, off, s[0:3], s33 offset:148 ; 4-byte Folded Reload -; GFX908-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:156 ; 4-byte Folded Reload -; GFX908-NEXT: s_mov_b64 exec, -1 -; GFX908-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:152 ; 4-byte Folded Reload -; GFX908-NEXT: s_mov_b64 exec, s[6:7] -; GFX908-NEXT: s_addk_i32 s32, 0xd400 -; GFX908-NEXT: s_mov_b32 s33, s4 -; GFX908-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) -; GFX908-NEXT: s_setpc_b64 s[30:31] - %vreg0 = call <32 x float> asm sideeffect "; def $0", "=v"() - %v40 = call i32 asm sideeffect "; def $0","=${v40}"() - - %s11 = call i32 asm sideeffect "; def $0","=${s11}"() - %s12 = call i32 asm sideeffect "; def $0","=${s12}"() - %s13 = call i32 asm sideeffect "; def $0","=${s13}"() - %s14 = call i32 asm sideeffect "; def $0","=${s14}"() - %s15 = call i32 asm sideeffect "; def $0","=${s15}"() - %s16 = call i32 asm sideeffect "; def $0","=${s16}"() - %s17 = call i32 asm sideeffect "; def $0","=${s17}"() - %s18 = call i32 asm sideeffect "; def $0","=${s18}"() - %s19 = call i32 asm sideeffect "; def $0","=${s19}"() - %s20 = call i32 asm sideeffect "; def $0","=${s20}"() - call void @foo() - call void asm sideeffect "; use $0","${s11}"(i32 %s11) - call void asm sideeffect "; use $0","${s12}"(i32 %s12) - call void asm sideeffect "; use $0","${s13}"(i32 %s13) - call void asm sideeffect "; use $0","${s14}"(i32 %s14) - call void asm sideeffect "; use $0","${s15}"(i32 %s15) - call void asm sideeffect "; use $0","${s16}"(i32 %s16) - call void asm sideeffect "; use $0","${s17}"(i32 %s17) - call void asm sideeffect "; use $0","${s18}"(i32 %s18) - call void asm sideeffect "; use $0","${s19}"(i32 %s19) - call void asm sideeffect "; use $0","${s20}"(i32 %s20) - - %s21 = call i32 asm sideeffect "; def $0","=${s21}"() - %s22 = call i32 asm sideeffect "; def $0","=${s22}"() - %s23 = call i32 asm sideeffect "; def $0","=${s23}"() - %s24 = call i32 asm sideeffect "; def $0","=${s24}"() - %s25 = call i32 asm sideeffect "; def $0","=${s25}"() - %s26 = call i32 asm sideeffect "; def $0","=${s26}"() - %s27 = call i32 asm sideeffect "; def $0","=${s27}"() - %s28 = call i32 asm sideeffect "; def $0","=${s28}"() - %s29 = call i32 asm sideeffect "; def $0","=${s29}"() - call void @foo() - call void asm sideeffect "; use $0","${s21}"(i32 %s21) - call void asm sideeffect "; use $0","${s22}"(i32 %s22) - call void asm sideeffect "; use $0","${s23}"(i32 %s23) - call void asm sideeffect "; use $0","${s24}"(i32 %s24) - call void asm sideeffect "; use $0","${s25}"(i32 %s25) - call void asm sideeffect "; use $0","${s26}"(i32 %s26) - call void asm sideeffect "; use $0","${s27}"(i32 %s27) - call void asm sideeffect "; use $0","${s28}"(i32 %s28) - call void asm sideeffect "; use $0","${s29}"(i32 %s29) - - call void @foo() - - store volatile <32 x float> %vreg0, ptr %parg0 - - ret void -} - -declare void @foo() - -attributes #0 = { "amdgpu-num-vgpr"="42" "amdgpu-num-sgpr"="40"} diff --git a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll --- a/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/scc-clobbered-sgpr-to-vmem-spill.ll @@ -1,381 +1,22 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck %s +; RUN: not --crash llc -mtriple=amdgcn--amdhsa -mcpu=gfx900 -verify-machineinstrs -o /dev/null %s 2>&1 | FileCheck %s -; This was a negative test to catch an extreme case when all options are exhausted -; while trying to spill SGPRs to memory. After we enabled SGPR spills into virtual VGPRs -; the edge case won't arise and the test would always compile. +; This ends up needing to spill SGPRs to memory, and also does not +; have any free SGPRs available to save the exec mask when doing so. +; The register scavenger also needs to use the emergency stack slot, +; which tries to place the scavenged register restore instruction as +; far the block as possible, near the terminator. This places a +; restore instruction between the condition and the conditional +; branch, which gets expanded into a sequence involving s_not_b64 on +; the exec mask, clobbering SCC value before the branch. We probably +; have to stop relying on being able to flip and restore the exec +; mask, and always require a free SGPR for saving exec. +; CHECK: *** Bad machine code: Using an undefined physical register *** +; CHECK-NEXT: - function: kernel0 +; CHECK-NEXT: - basic block: %bb.0 +; CHECK-NEXT: - instruction: S_CBRANCH_SCC1 %bb.2, implicit killed $scc +; CHECK-NEXT: - operand 1: implicit killed $scc define amdgpu_kernel void @kernel0(ptr addrspace(1) %out, i32 %in) #1 { -; CHECK-LABEL: kernel0: -; CHECK: ; %bb.0: -; CHECK-NEXT: ; implicit-def: $vgpr23 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[2:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s2, 0 -; CHECK-NEXT: s_load_dword s0, s[4:5], 0x8 -; CHECK-NEXT: v_writelane_b32 v23, s3, 1 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 2 -; CHECK-NEXT: v_writelane_b32 v23, s5, 3 -; CHECK-NEXT: v_writelane_b32 v23, s6, 4 -; CHECK-NEXT: v_writelane_b32 v23, s7, 5 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:11] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 6 -; CHECK-NEXT: v_writelane_b32 v23, s5, 7 -; CHECK-NEXT: v_writelane_b32 v23, s6, 8 -; CHECK-NEXT: v_writelane_b32 v23, s7, 9 -; CHECK-NEXT: v_writelane_b32 v23, s8, 10 -; CHECK-NEXT: v_writelane_b32 v23, s9, 11 -; CHECK-NEXT: v_writelane_b32 v23, s10, 12 -; CHECK-NEXT: v_writelane_b32 v23, s11, 13 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:19] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 14 -; CHECK-NEXT: v_writelane_b32 v23, s5, 15 -; CHECK-NEXT: v_writelane_b32 v23, s6, 16 -; CHECK-NEXT: v_writelane_b32 v23, s7, 17 -; CHECK-NEXT: v_writelane_b32 v23, s8, 18 -; CHECK-NEXT: v_writelane_b32 v23, s9, 19 -; CHECK-NEXT: v_writelane_b32 v23, s10, 20 -; CHECK-NEXT: v_writelane_b32 v23, s11, 21 -; CHECK-NEXT: v_writelane_b32 v23, s12, 22 -; CHECK-NEXT: v_writelane_b32 v23, s13, 23 -; CHECK-NEXT: v_writelane_b32 v23, s14, 24 -; CHECK-NEXT: v_writelane_b32 v23, s15, 25 -; CHECK-NEXT: v_writelane_b32 v23, s16, 26 -; CHECK-NEXT: v_writelane_b32 v23, s17, 27 -; CHECK-NEXT: v_writelane_b32 v23, s18, 28 -; CHECK-NEXT: v_writelane_b32 v23, s19, 29 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[2:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s2, 30 -; CHECK-NEXT: v_writelane_b32 v23, s3, 31 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 32 -; CHECK-NEXT: v_writelane_b32 v23, s5, 33 -; CHECK-NEXT: v_writelane_b32 v23, s6, 34 -; CHECK-NEXT: v_writelane_b32 v23, s7, 35 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[4:11] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s4, 36 -; CHECK-NEXT: v_writelane_b32 v23, s5, 37 -; CHECK-NEXT: v_writelane_b32 v23, s6, 38 -; CHECK-NEXT: v_writelane_b32 v23, s7, 39 -; CHECK-NEXT: v_writelane_b32 v23, s8, 40 -; CHECK-NEXT: v_writelane_b32 v23, s9, 41 -; CHECK-NEXT: v_writelane_b32 v23, s10, 42 -; CHECK-NEXT: v_writelane_b32 v23, s11, 43 -; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: s_cmp_lg_u32 s0, 0 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[16:31] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[52:53] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[48:51] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[36:43] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:15] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 44 -; CHECK-NEXT: v_writelane_b32 v23, s1, 45 -; CHECK-NEXT: v_writelane_b32 v23, s2, 46 -; CHECK-NEXT: v_writelane_b32 v23, s3, 47 -; CHECK-NEXT: v_writelane_b32 v23, s4, 48 -; CHECK-NEXT: v_writelane_b32 v23, s5, 49 -; CHECK-NEXT: v_writelane_b32 v23, s6, 50 -; CHECK-NEXT: v_writelane_b32 v23, s7, 51 -; CHECK-NEXT: v_writelane_b32 v23, s8, 52 -; CHECK-NEXT: v_writelane_b32 v23, s9, 53 -; CHECK-NEXT: v_writelane_b32 v23, s10, 54 -; CHECK-NEXT: v_writelane_b32 v23, s11, 55 -; CHECK-NEXT: v_writelane_b32 v23, s12, 56 -; CHECK-NEXT: v_writelane_b32 v23, s13, 57 -; CHECK-NEXT: v_writelane_b32 v23, s14, 58 -; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: v_writelane_b32 v23, s15, 59 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[34:35] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[44:47] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v23, s0, 60 -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 -; CHECK-NEXT: v_writelane_b32 v23, s1, 61 -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 -; CHECK-NEXT: v_writelane_b32 v23, s2, 62 -; CHECK-NEXT: v_writelane_b32 v0, s6, 2 -; CHECK-NEXT: v_writelane_b32 v23, s3, 63 -; CHECK-NEXT: v_writelane_b32 v0, s7, 3 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:15] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 4 -; CHECK-NEXT: v_writelane_b32 v0, s1, 5 -; CHECK-NEXT: v_writelane_b32 v0, s2, 6 -; CHECK-NEXT: v_writelane_b32 v0, s3, 7 -; CHECK-NEXT: v_writelane_b32 v0, s4, 8 -; CHECK-NEXT: v_writelane_b32 v0, s5, 9 -; CHECK-NEXT: v_writelane_b32 v0, s6, 10 -; CHECK-NEXT: v_writelane_b32 v0, s7, 11 -; CHECK-NEXT: v_writelane_b32 v0, s8, 12 -; CHECK-NEXT: v_writelane_b32 v0, s9, 13 -; CHECK-NEXT: v_writelane_b32 v0, s10, 14 -; CHECK-NEXT: v_writelane_b32 v0, s11, 15 -; CHECK-NEXT: v_writelane_b32 v0, s12, 16 -; CHECK-NEXT: v_writelane_b32 v0, s13, 17 -; CHECK-NEXT: v_writelane_b32 v0, s14, 18 -; CHECK-NEXT: v_writelane_b32 v0, s15, 19 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[54:55] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 20 -; CHECK-NEXT: v_writelane_b32 v0, s1, 21 -; CHECK-NEXT: v_writelane_b32 v0, s2, 22 -; CHECK-NEXT: v_writelane_b32 v0, s3, 23 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 24 -; CHECK-NEXT: v_writelane_b32 v0, s1, 25 -; CHECK-NEXT: v_writelane_b32 v0, s2, 26 -; CHECK-NEXT: v_writelane_b32 v0, s3, 27 -; CHECK-NEXT: v_writelane_b32 v0, s4, 28 -; CHECK-NEXT: v_writelane_b32 v0, s5, 29 -; CHECK-NEXT: v_writelane_b32 v0, s6, 30 -; CHECK-NEXT: v_writelane_b32 v0, s7, 31 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; def s[0:15] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_writelane_b32 v0, s0, 32 -; CHECK-NEXT: v_writelane_b32 v0, s1, 33 -; CHECK-NEXT: v_writelane_b32 v0, s2, 34 -; CHECK-NEXT: v_writelane_b32 v0, s3, 35 -; CHECK-NEXT: v_writelane_b32 v0, s4, 36 -; CHECK-NEXT: v_writelane_b32 v0, s5, 37 -; CHECK-NEXT: v_writelane_b32 v0, s6, 38 -; CHECK-NEXT: v_writelane_b32 v0, s7, 39 -; CHECK-NEXT: v_writelane_b32 v0, s8, 40 -; CHECK-NEXT: v_writelane_b32 v0, s9, 41 -; CHECK-NEXT: v_writelane_b32 v0, s10, 42 -; CHECK-NEXT: v_writelane_b32 v0, s11, 43 -; CHECK-NEXT: v_writelane_b32 v0, s12, 44 -; CHECK-NEXT: v_writelane_b32 v0, s13, 45 -; CHECK-NEXT: v_writelane_b32 v0, s14, 46 -; CHECK-NEXT: v_writelane_b32 v0, s15, 47 -; CHECK-NEXT: s_cbranch_scc0 .LBB0_2 -; CHECK-NEXT: ; %bb.1: ; %ret -; CHECK-NEXT: ; kill: killed $vgpr23 -; CHECK-NEXT: ; kill: killed $vgpr0 -; CHECK-NEXT: s_endpgm -; CHECK-NEXT: .LBB0_2: ; %bb0 -; CHECK-NEXT: v_readlane_b32 s0, v23, 0 -; CHECK-NEXT: v_readlane_b32 s1, v23, 1 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 2 -; CHECK-NEXT: v_readlane_b32 s1, v23, 3 -; CHECK-NEXT: v_readlane_b32 s2, v23, 4 -; CHECK-NEXT: v_readlane_b32 s3, v23, 5 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 6 -; CHECK-NEXT: v_readlane_b32 s1, v23, 7 -; CHECK-NEXT: v_readlane_b32 s2, v23, 8 -; CHECK-NEXT: v_readlane_b32 s3, v23, 9 -; CHECK-NEXT: v_readlane_b32 s4, v23, 10 -; CHECK-NEXT: v_readlane_b32 s5, v23, 11 -; CHECK-NEXT: v_readlane_b32 s6, v23, 12 -; CHECK-NEXT: v_readlane_b32 s7, v23, 13 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 14 -; CHECK-NEXT: v_readlane_b32 s1, v23, 15 -; CHECK-NEXT: v_readlane_b32 s2, v23, 16 -; CHECK-NEXT: v_readlane_b32 s3, v23, 17 -; CHECK-NEXT: v_readlane_b32 s4, v23, 18 -; CHECK-NEXT: v_readlane_b32 s5, v23, 19 -; CHECK-NEXT: v_readlane_b32 s6, v23, 20 -; CHECK-NEXT: v_readlane_b32 s7, v23, 21 -; CHECK-NEXT: v_readlane_b32 s8, v23, 22 -; CHECK-NEXT: v_readlane_b32 s9, v23, 23 -; CHECK-NEXT: v_readlane_b32 s10, v23, 24 -; CHECK-NEXT: v_readlane_b32 s11, v23, 25 -; CHECK-NEXT: v_readlane_b32 s12, v23, 26 -; CHECK-NEXT: v_readlane_b32 s13, v23, 27 -; CHECK-NEXT: v_readlane_b32 s14, v23, 28 -; CHECK-NEXT: v_readlane_b32 s15, v23, 29 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:15] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 30 -; CHECK-NEXT: v_readlane_b32 s1, v23, 31 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:1] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 32 -; CHECK-NEXT: v_readlane_b32 s1, v23, 33 -; CHECK-NEXT: v_readlane_b32 s2, v23, 34 -; CHECK-NEXT: v_readlane_b32 s3, v23, 35 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 36 -; CHECK-NEXT: v_readlane_b32 s1, v23, 37 -; CHECK-NEXT: v_readlane_b32 s2, v23, 38 -; CHECK-NEXT: v_readlane_b32 s3, v23, 39 -; CHECK-NEXT: v_readlane_b32 s4, v23, 40 -; CHECK-NEXT: v_readlane_b32 s5, v23, 41 -; CHECK-NEXT: v_readlane_b32 s6, v23, 42 -; CHECK-NEXT: v_readlane_b32 s7, v23, 43 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 44 -; CHECK-NEXT: v_readlane_b32 s1, v23, 45 -; CHECK-NEXT: v_readlane_b32 s2, v23, 46 -; CHECK-NEXT: v_readlane_b32 s3, v23, 47 -; CHECK-NEXT: v_readlane_b32 s4, v23, 48 -; CHECK-NEXT: v_readlane_b32 s5, v23, 49 -; CHECK-NEXT: v_readlane_b32 s6, v23, 50 -; CHECK-NEXT: v_readlane_b32 s7, v23, 51 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[16:31] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[52:53] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[48:51] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[36:43] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s8, v23, 52 -; CHECK-NEXT: v_readlane_b32 s9, v23, 53 -; CHECK-NEXT: v_readlane_b32 s10, v23, 54 -; CHECK-NEXT: v_readlane_b32 s11, v23, 55 -; CHECK-NEXT: v_readlane_b32 s12, v23, 56 -; CHECK-NEXT: v_readlane_b32 s13, v23, 57 -; CHECK-NEXT: v_readlane_b32 s14, v23, 58 -; CHECK-NEXT: v_readlane_b32 s15, v23, 59 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:15] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v23, 60 -; CHECK-NEXT: v_readlane_b32 s1, v23, 61 -; CHECK-NEXT: v_readlane_b32 s2, v23, 62 -; CHECK-NEXT: v_readlane_b32 s3, v23, 63 -; CHECK-NEXT: v_readlane_b32 s4, v0, 0 -; CHECK-NEXT: v_readlane_b32 s5, v0, 1 -; CHECK-NEXT: v_readlane_b32 s6, v0, 2 -; CHECK-NEXT: v_readlane_b32 s7, v0, 3 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[34:35] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[44:47] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 4 -; CHECK-NEXT: v_readlane_b32 s1, v0, 5 -; CHECK-NEXT: v_readlane_b32 s2, v0, 6 -; CHECK-NEXT: v_readlane_b32 s3, v0, 7 -; CHECK-NEXT: v_readlane_b32 s4, v0, 8 -; CHECK-NEXT: v_readlane_b32 s5, v0, 9 -; CHECK-NEXT: v_readlane_b32 s6, v0, 10 -; CHECK-NEXT: v_readlane_b32 s7, v0, 11 -; CHECK-NEXT: v_readlane_b32 s8, v0, 12 -; CHECK-NEXT: v_readlane_b32 s9, v0, 13 -; CHECK-NEXT: v_readlane_b32 s10, v0, 14 -; CHECK-NEXT: v_readlane_b32 s11, v0, 15 -; CHECK-NEXT: v_readlane_b32 s12, v0, 16 -; CHECK-NEXT: v_readlane_b32 s13, v0, 17 -; CHECK-NEXT: v_readlane_b32 s14, v0, 18 -; CHECK-NEXT: v_readlane_b32 s15, v0, 19 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:15] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 20 -; CHECK-NEXT: v_readlane_b32 s1, v0, 21 -; CHECK-NEXT: v_readlane_b32 s2, v0, 22 -; CHECK-NEXT: v_readlane_b32 s3, v0, 23 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[54:55] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:3] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 24 -; CHECK-NEXT: v_readlane_b32 s1, v0, 25 -; CHECK-NEXT: v_readlane_b32 s2, v0, 26 -; CHECK-NEXT: v_readlane_b32 s3, v0, 27 -; CHECK-NEXT: v_readlane_b32 s4, v0, 28 -; CHECK-NEXT: v_readlane_b32 s5, v0, 29 -; CHECK-NEXT: v_readlane_b32 s6, v0, 30 -; CHECK-NEXT: v_readlane_b32 s7, v0, 31 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:7] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: v_readlane_b32 s0, v0, 32 -; CHECK-NEXT: v_readlane_b32 s1, v0, 33 -; CHECK-NEXT: v_readlane_b32 s2, v0, 34 -; CHECK-NEXT: v_readlane_b32 s3, v0, 35 -; CHECK-NEXT: v_readlane_b32 s4, v0, 36 -; CHECK-NEXT: v_readlane_b32 s5, v0, 37 -; CHECK-NEXT: v_readlane_b32 s6, v0, 38 -; CHECK-NEXT: v_readlane_b32 s7, v0, 39 -; CHECK-NEXT: v_readlane_b32 s8, v0, 40 -; CHECK-NEXT: v_readlane_b32 s9, v0, 41 -; CHECK-NEXT: v_readlane_b32 s10, v0, 42 -; CHECK-NEXT: v_readlane_b32 s11, v0, 43 -; CHECK-NEXT: v_readlane_b32 s12, v0, 44 -; CHECK-NEXT: v_readlane_b32 s13, v0, 45 -; CHECK-NEXT: v_readlane_b32 s14, v0, 46 -; CHECK-NEXT: v_readlane_b32 s15, v0, 47 -; CHECK-NEXT: ;;#ASMSTART -; CHECK-NEXT: ; use s[0:15] -; CHECK-NEXT: ;;#ASMEND -; CHECK-NEXT: ; kill: killed $vgpr23 -; CHECK-NEXT: ; kill: killed $vgpr0 -; CHECK-NEXT: s_endpgm call void asm sideeffect "", "~{v[0:7]}" () #0 call void asm sideeffect "", "~{v[8:15]}" () #0 call void asm sideeffect "", "~{v[16:19]}"() #0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-regalloc-flags.ll @@ -21,7 +21,6 @@ ; DEFAULT-NEXT: Virtual Register Map ; DEFAULT-NEXT: Live Register Matrix ; DEFAULT-NEXT: Greedy Register Allocator -; DEFAULT-NEXT: SI Lower WWM Copies ; DEFAULT-NEXT: GCN NSA Reassign ; DEFAULT-NEXT: Virtual Register Rewriter ; DEFAULT-NEXT: Stack Slot Coloring @@ -29,7 +28,6 @@ ; O0: Fast Register Allocator ; O0-NEXT: SI lower SGPR spill instructions ; O0-NEXT: Fast Register Allocator -; O0-NEXT: SI Lower WWM Copies ; O0-NEXT: SI Fix VGPR copies @@ -51,7 +49,6 @@ ; BASIC-DEFAULT-NEXT: Lazy Machine Block Frequency Analysis ; BASIC-DEFAULT-NEXT: Machine Optimization Remark Emitter ; BASIC-DEFAULT-NEXT: Greedy Register Allocator -; BASIC-DEFAULT-NEXT: SI Lower WWM Copies ; BASIC-DEFAULT-NEXT: GCN NSA Reassign ; BASIC-DEFAULT-NEXT: Virtual Register Rewriter ; BASIC-DEFAULT-NEXT: Stack Slot Coloring @@ -64,7 +61,6 @@ ; DEFAULT-BASIC-NEXT: Virtual Register Map ; DEFAULT-BASIC-NEXT: Live Register Matrix ; DEFAULT-BASIC-NEXT: Basic Register Allocator -; DEFAULT-BASIC-NEXT: SI Lower WWM Copies ; DEFAULT-BASIC-NEXT: GCN NSA Reassign ; DEFAULT-BASIC-NEXT: Virtual Register Rewriter ; DEFAULT-BASIC-NEXT: Stack Slot Coloring @@ -83,7 +79,6 @@ ; BASIC-BASIC-NEXT: Virtual Register Map ; BASIC-BASIC-NEXT: Live Register Matrix ; BASIC-BASIC-NEXT: Basic Register Allocator -; BASIC-BASIC-NEXT: SI Lower WWM Copies ; BASIC-BASIC-NEXT: GCN NSA Reassign ; BASIC-BASIC-NEXT: Virtual Register Rewriter ; BASIC-BASIC-NEXT: Stack Slot Coloring diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-dead-frame-in-dbg-value.mir @@ -1,5 +1,4 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck -check-prefix=SGPR_SPILL %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs --start-before=si-lower-sgpr-spills --stop-after=prologepilog -o - %s | FileCheck -check-prefix=PEI %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s # After handling the SGPR spill to VGPR in SILowerSGPRSpills pass, replace the dead frame index in the DBG_VALUE instruction with reg 0. # Otherwise, the test would crash during PEI while trying to replace the dead frame index. @@ -40,33 +39,13 @@ workGroupIDX: { reg: '$sgpr8' } privateSegmentWaveByteOffset: { reg: '$sgpr9' } body: | - ; SGPR_SPILL-LABEL: name: test - ; SGPR_SPILL: bb.0: - ; SGPR_SPILL-NEXT: successors: %bb.1(0x80000000) - ; SGPR_SPILL-NEXT: {{ $}} - ; SGPR_SPILL-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; SGPR_SPILL-NEXT: renamable $sgpr10 = IMPLICIT_DEF - ; SGPR_SPILL-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] - ; SGPR_SPILL-NEXT: DBG_VALUE $noreg, 0 - ; SGPR_SPILL-NEXT: {{ $}} - ; SGPR_SPILL-NEXT: bb.1: - ; SGPR_SPILL-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_]], 0 - ; SGPR_SPILL-NEXT: KILL [[V_WRITELANE_B32_]] - ; SGPR_SPILL-NEXT: S_ENDPGM 0 - ; PEI-LABEL: name: test - ; PEI: bb.0: - ; PEI-NEXT: successors: %bb.1(0x80000000) - ; PEI-NEXT: {{ $}} - ; PEI-NEXT: renamable $vgpr0 = IMPLICIT_DEF - ; PEI-NEXT: renamable $sgpr10 = IMPLICIT_DEF - ; PEI-NEXT: renamable $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 0, killed $vgpr0 - ; PEI-NEXT: {{ $}} - ; PEI-NEXT: bb.1: - ; PEI-NEXT: liveins: $vgpr0 - ; PEI-NEXT: {{ $}} - ; PEI-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 0 - ; PEI-NEXT: KILL killed renamable $vgpr0 - ; PEI-NEXT: S_ENDPGM 0 + ; CHECK-LABEL: name: test + ; CHECK: bb.0: + ; CHECK: $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 0, $vgpr0 + ; CHECK: DBG_VALUE $noreg, 0 + ; CHECK: bb.1: + ; CHECK: $sgpr10 = V_READLANE_B32 $vgpr0, 0 + ; CHECK: S_ENDPGM 0 bb.0: renamable $sgpr10 = IMPLICIT_DEF SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-fi-skip-processing-stack-arg-dbg-value.mir @@ -1,4 +1,4 @@ -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=true -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s # After handling the SGPR spill to VGPR in SILowerSGPRSpills pass, we replace the dead frame index in the DBG_VALUE instruction with reg 0. # Skip looking for frame indices in the debug value instruction for incoming arguments passed via stack. The test would crash otherwise. @@ -45,7 +45,7 @@ body: | ; CHECK-LABEL: name: test ; CHECK: bb.0: - ; CHECK: DBG_VALUE + ; CHECK: DBG_VALUE $noreg, 0 bb.0: renamable $sgpr10 = IMPLICIT_DEF SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-no-vgprs.ll @@ -9,17 +9,9 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_add_u32 s0, s0, s15 ; GCN-NEXT: s_addc_u32 s1, s1, 0 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: s_load_dword s4, s[8:9], 0x2 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[24:25] -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[24:25] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: ;;#ASMSTART @@ -31,179 +23,179 @@ ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(1) -; GCN-NEXT: v_writelane_b32 v1, s8, 0 -; GCN-NEXT: v_writelane_b32 v1, s9, 1 -; GCN-NEXT: v_writelane_b32 v1, s10, 2 -; GCN-NEXT: v_writelane_b32 v1, s11, 3 -; GCN-NEXT: v_writelane_b32 v1, s12, 4 -; GCN-NEXT: v_writelane_b32 v1, s13, 5 -; GCN-NEXT: v_writelane_b32 v1, s14, 6 -; GCN-NEXT: v_writelane_b32 v1, s15, 7 -; GCN-NEXT: v_writelane_b32 v1, s16, 8 -; GCN-NEXT: v_writelane_b32 v1, s17, 9 -; GCN-NEXT: v_writelane_b32 v1, s18, 10 -; GCN-NEXT: v_writelane_b32 v1, s19, 11 -; GCN-NEXT: v_writelane_b32 v1, s20, 12 -; GCN-NEXT: v_writelane_b32 v1, s21, 13 -; GCN-NEXT: v_writelane_b32 v1, s22, 14 -; GCN-NEXT: v_writelane_b32 v1, s23, 15 +; GCN-NEXT: v_writelane_b32 v23, s8, 0 +; GCN-NEXT: v_writelane_b32 v23, s9, 1 +; GCN-NEXT: v_writelane_b32 v23, s10, 2 +; GCN-NEXT: v_writelane_b32 v23, s11, 3 +; GCN-NEXT: v_writelane_b32 v23, s12, 4 +; GCN-NEXT: v_writelane_b32 v23, s13, 5 +; GCN-NEXT: v_writelane_b32 v23, s14, 6 +; GCN-NEXT: v_writelane_b32 v23, s15, 7 +; GCN-NEXT: v_writelane_b32 v23, s16, 8 +; GCN-NEXT: v_writelane_b32 v23, s17, 9 +; GCN-NEXT: v_writelane_b32 v23, s18, 10 +; GCN-NEXT: v_writelane_b32 v23, s19, 11 +; GCN-NEXT: v_writelane_b32 v23, s20, 12 +; GCN-NEXT: v_writelane_b32 v23, s21, 13 +; GCN-NEXT: v_writelane_b32 v23, s22, 14 +; GCN-NEXT: v_writelane_b32 v23, s23, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s8, 16 -; GCN-NEXT: v_writelane_b32 v1, s9, 17 -; GCN-NEXT: v_writelane_b32 v1, s10, 18 -; GCN-NEXT: v_writelane_b32 v1, s11, 19 -; GCN-NEXT: v_writelane_b32 v1, s12, 20 -; GCN-NEXT: v_writelane_b32 v1, s13, 21 -; GCN-NEXT: v_writelane_b32 v1, s14, 22 -; GCN-NEXT: v_writelane_b32 v1, s15, 23 -; GCN-NEXT: v_writelane_b32 v1, s16, 24 -; GCN-NEXT: v_writelane_b32 v1, s17, 25 -; GCN-NEXT: v_writelane_b32 v1, s18, 26 -; GCN-NEXT: v_writelane_b32 v1, s19, 27 -; GCN-NEXT: v_writelane_b32 v1, s20, 28 -; GCN-NEXT: v_writelane_b32 v1, s21, 29 -; GCN-NEXT: v_writelane_b32 v1, s22, 30 -; GCN-NEXT: v_writelane_b32 v1, s23, 31 +; GCN-NEXT: v_writelane_b32 v23, s8, 16 +; GCN-NEXT: v_writelane_b32 v23, s9, 17 +; GCN-NEXT: v_writelane_b32 v23, s10, 18 +; GCN-NEXT: v_writelane_b32 v23, s11, 19 +; GCN-NEXT: v_writelane_b32 v23, s12, 20 +; GCN-NEXT: v_writelane_b32 v23, s13, 21 +; GCN-NEXT: v_writelane_b32 v23, s14, 22 +; GCN-NEXT: v_writelane_b32 v23, s15, 23 +; GCN-NEXT: v_writelane_b32 v23, s16, 24 +; GCN-NEXT: v_writelane_b32 v23, s17, 25 +; GCN-NEXT: v_writelane_b32 v23, s18, 26 +; GCN-NEXT: v_writelane_b32 v23, s19, 27 +; GCN-NEXT: v_writelane_b32 v23, s20, 28 +; GCN-NEXT: v_writelane_b32 v23, s21, 29 +; GCN-NEXT: v_writelane_b32 v23, s22, 30 +; GCN-NEXT: v_writelane_b32 v23, s23, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s8, 32 -; GCN-NEXT: v_writelane_b32 v1, s9, 33 -; GCN-NEXT: v_writelane_b32 v1, s10, 34 -; GCN-NEXT: v_writelane_b32 v1, s11, 35 -; GCN-NEXT: v_writelane_b32 v1, s12, 36 -; GCN-NEXT: v_writelane_b32 v1, s13, 37 -; GCN-NEXT: v_writelane_b32 v1, s14, 38 -; GCN-NEXT: v_writelane_b32 v1, s15, 39 -; GCN-NEXT: v_writelane_b32 v1, s16, 40 -; GCN-NEXT: v_writelane_b32 v1, s17, 41 -; GCN-NEXT: v_writelane_b32 v1, s18, 42 -; GCN-NEXT: v_writelane_b32 v1, s19, 43 -; GCN-NEXT: v_writelane_b32 v1, s20, 44 -; GCN-NEXT: v_writelane_b32 v1, s21, 45 -; GCN-NEXT: v_writelane_b32 v1, s22, 46 -; GCN-NEXT: v_writelane_b32 v1, s23, 47 +; GCN-NEXT: v_writelane_b32 v23, s8, 32 +; GCN-NEXT: v_writelane_b32 v23, s9, 33 +; GCN-NEXT: v_writelane_b32 v23, s10, 34 +; GCN-NEXT: v_writelane_b32 v23, s11, 35 +; GCN-NEXT: v_writelane_b32 v23, s12, 36 +; GCN-NEXT: v_writelane_b32 v23, s13, 37 +; GCN-NEXT: v_writelane_b32 v23, s14, 38 +; GCN-NEXT: v_writelane_b32 v23, s15, 39 +; GCN-NEXT: v_writelane_b32 v23, s16, 40 +; GCN-NEXT: v_writelane_b32 v23, s17, 41 +; GCN-NEXT: v_writelane_b32 v23, s18, 42 +; GCN-NEXT: v_writelane_b32 v23, s19, 43 +; GCN-NEXT: v_writelane_b32 v23, s20, 44 +; GCN-NEXT: v_writelane_b32 v23, s21, 45 +; GCN-NEXT: v_writelane_b32 v23, s22, 46 +; GCN-NEXT: v_writelane_b32 v23, s23, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[8:23] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v1, s8, 48 -; GCN-NEXT: v_writelane_b32 v1, s9, 49 -; GCN-NEXT: v_writelane_b32 v1, s10, 50 -; GCN-NEXT: v_writelane_b32 v1, s11, 51 -; GCN-NEXT: v_writelane_b32 v1, s12, 52 -; GCN-NEXT: v_writelane_b32 v1, s13, 53 -; GCN-NEXT: v_writelane_b32 v1, s14, 54 -; GCN-NEXT: v_writelane_b32 v1, s15, 55 -; GCN-NEXT: v_writelane_b32 v1, s16, 56 -; GCN-NEXT: v_writelane_b32 v1, s17, 57 -; GCN-NEXT: v_writelane_b32 v1, s18, 58 -; GCN-NEXT: v_writelane_b32 v1, s19, 59 -; GCN-NEXT: v_writelane_b32 v1, s20, 60 -; GCN-NEXT: v_writelane_b32 v1, s21, 61 -; GCN-NEXT: v_writelane_b32 v1, s22, 62 -; GCN-NEXT: v_writelane_b32 v1, s23, 63 -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[24:25] +; GCN-NEXT: v_writelane_b32 v23, s8, 48 +; GCN-NEXT: v_writelane_b32 v23, s9, 49 +; GCN-NEXT: v_writelane_b32 v23, s10, 50 +; GCN-NEXT: v_writelane_b32 v23, s11, 51 +; GCN-NEXT: v_writelane_b32 v23, s12, 52 +; GCN-NEXT: v_writelane_b32 v23, s13, 53 +; GCN-NEXT: v_writelane_b32 v23, s14, 54 +; GCN-NEXT: v_writelane_b32 v23, s15, 55 +; GCN-NEXT: v_writelane_b32 v23, s16, 56 +; GCN-NEXT: v_writelane_b32 v23, s17, 57 +; GCN-NEXT: v_writelane_b32 v23, s18, 58 +; GCN-NEXT: v_writelane_b32 v23, s19, 59 +; GCN-NEXT: v_writelane_b32 v23, s20, 60 +; GCN-NEXT: v_writelane_b32 v23, s21, 61 +; GCN-NEXT: v_writelane_b32 v23, s22, 62 +; GCN-NEXT: v_writelane_b32 v23, s23, 63 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s[6:7] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(1) +; GCN-NEXT: s_mov_b64 s[8:9], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GCN-NEXT: v_writelane_b32 v0, s6, 0 ; GCN-NEXT: v_writelane_b32 v0, s7, 1 -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[24:25] +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: s_mov_b32 s5, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_cmp_lg_u32 s4, s5 ; GCN-NEXT: s_cbranch_scc1 .LBB0_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[24:25] -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[24:25] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s4, v1, 0 -; GCN-NEXT: v_readlane_b32 s5, v1, 1 -; GCN-NEXT: v_readlane_b32 s6, v1, 2 -; GCN-NEXT: v_readlane_b32 s7, v1, 3 -; GCN-NEXT: v_readlane_b32 s8, v1, 4 -; GCN-NEXT: v_readlane_b32 s9, v1, 5 -; GCN-NEXT: v_readlane_b32 s10, v1, 6 -; GCN-NEXT: v_readlane_b32 s11, v1, 7 -; GCN-NEXT: v_readlane_b32 s12, v1, 8 -; GCN-NEXT: v_readlane_b32 s13, v1, 9 -; GCN-NEXT: v_readlane_b32 s14, v1, 10 -; GCN-NEXT: v_readlane_b32 s15, v1, 11 -; GCN-NEXT: v_readlane_b32 s16, v1, 12 -; GCN-NEXT: v_readlane_b32 s17, v1, 13 -; GCN-NEXT: v_readlane_b32 s18, v1, 14 -; GCN-NEXT: v_readlane_b32 s19, v1, 15 +; GCN-NEXT: v_readlane_b32 s4, v23, 0 +; GCN-NEXT: v_readlane_b32 s5, v23, 1 +; GCN-NEXT: v_readlane_b32 s6, v23, 2 +; GCN-NEXT: v_readlane_b32 s7, v23, 3 +; GCN-NEXT: v_readlane_b32 s8, v23, 4 +; GCN-NEXT: v_readlane_b32 s9, v23, 5 +; GCN-NEXT: v_readlane_b32 s10, v23, 6 +; GCN-NEXT: v_readlane_b32 s11, v23, 7 +; GCN-NEXT: v_readlane_b32 s12, v23, 8 +; GCN-NEXT: v_readlane_b32 s13, v23, 9 +; GCN-NEXT: v_readlane_b32 s14, v23, 10 +; GCN-NEXT: v_readlane_b32 s15, v23, 11 +; GCN-NEXT: v_readlane_b32 s16, v23, 12 +; GCN-NEXT: v_readlane_b32 s17, v23, 13 +; GCN-NEXT: v_readlane_b32 s18, v23, 14 +; GCN-NEXT: v_readlane_b32 s19, v23, 15 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v1, 16 -; GCN-NEXT: v_readlane_b32 s5, v1, 17 -; GCN-NEXT: v_readlane_b32 s6, v1, 18 -; GCN-NEXT: v_readlane_b32 s7, v1, 19 -; GCN-NEXT: v_readlane_b32 s8, v1, 20 -; GCN-NEXT: v_readlane_b32 s9, v1, 21 -; GCN-NEXT: v_readlane_b32 s10, v1, 22 -; GCN-NEXT: v_readlane_b32 s11, v1, 23 -; GCN-NEXT: v_readlane_b32 s12, v1, 24 -; GCN-NEXT: v_readlane_b32 s13, v1, 25 -; GCN-NEXT: v_readlane_b32 s14, v1, 26 -; GCN-NEXT: v_readlane_b32 s15, v1, 27 -; GCN-NEXT: v_readlane_b32 s16, v1, 28 -; GCN-NEXT: v_readlane_b32 s17, v1, 29 -; GCN-NEXT: v_readlane_b32 s18, v1, 30 -; GCN-NEXT: v_readlane_b32 s19, v1, 31 +; GCN-NEXT: v_readlane_b32 s4, v23, 16 +; GCN-NEXT: v_readlane_b32 s5, v23, 17 +; GCN-NEXT: v_readlane_b32 s6, v23, 18 +; GCN-NEXT: v_readlane_b32 s7, v23, 19 +; GCN-NEXT: v_readlane_b32 s8, v23, 20 +; GCN-NEXT: v_readlane_b32 s9, v23, 21 +; GCN-NEXT: v_readlane_b32 s10, v23, 22 +; GCN-NEXT: v_readlane_b32 s11, v23, 23 +; GCN-NEXT: v_readlane_b32 s12, v23, 24 +; GCN-NEXT: v_readlane_b32 s13, v23, 25 +; GCN-NEXT: v_readlane_b32 s14, v23, 26 +; GCN-NEXT: v_readlane_b32 s15, v23, 27 +; GCN-NEXT: v_readlane_b32 s16, v23, 28 +; GCN-NEXT: v_readlane_b32 s17, v23, 29 +; GCN-NEXT: v_readlane_b32 s18, v23, 30 +; GCN-NEXT: v_readlane_b32 s19, v23, 31 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s4, v1, 32 -; GCN-NEXT: v_readlane_b32 s5, v1, 33 -; GCN-NEXT: v_readlane_b32 s6, v1, 34 -; GCN-NEXT: v_readlane_b32 s7, v1, 35 -; GCN-NEXT: v_readlane_b32 s8, v1, 36 -; GCN-NEXT: v_readlane_b32 s9, v1, 37 -; GCN-NEXT: v_readlane_b32 s10, v1, 38 -; GCN-NEXT: v_readlane_b32 s11, v1, 39 -; GCN-NEXT: v_readlane_b32 s12, v1, 40 -; GCN-NEXT: v_readlane_b32 s13, v1, 41 -; GCN-NEXT: v_readlane_b32 s14, v1, 42 -; GCN-NEXT: v_readlane_b32 s15, v1, 43 -; GCN-NEXT: v_readlane_b32 s16, v1, 44 -; GCN-NEXT: v_readlane_b32 s17, v1, 45 -; GCN-NEXT: v_readlane_b32 s18, v1, 46 -; GCN-NEXT: v_readlane_b32 s19, v1, 47 +; GCN-NEXT: v_readlane_b32 s4, v23, 32 +; GCN-NEXT: v_readlane_b32 s5, v23, 33 +; GCN-NEXT: v_readlane_b32 s6, v23, 34 +; GCN-NEXT: v_readlane_b32 s7, v23, 35 +; GCN-NEXT: v_readlane_b32 s8, v23, 36 +; GCN-NEXT: v_readlane_b32 s9, v23, 37 +; GCN-NEXT: v_readlane_b32 s10, v23, 38 +; GCN-NEXT: v_readlane_b32 s11, v23, 39 +; GCN-NEXT: v_readlane_b32 s12, v23, 40 +; GCN-NEXT: v_readlane_b32 s13, v23, 41 +; GCN-NEXT: v_readlane_b32 s14, v23, 42 +; GCN-NEXT: v_readlane_b32 s15, v23, 43 +; GCN-NEXT: v_readlane_b32 s16, v23, 44 +; GCN-NEXT: v_readlane_b32 s17, v23, 45 +; GCN-NEXT: v_readlane_b32 s18, v23, 46 +; GCN-NEXT: v_readlane_b32 s19, v23, 47 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[4:19] ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_readlane_b32 s8, v1, 48 -; GCN-NEXT: v_readlane_b32 s9, v1, 49 -; GCN-NEXT: v_readlane_b32 s10, v1, 50 -; GCN-NEXT: v_readlane_b32 s11, v1, 51 -; GCN-NEXT: v_readlane_b32 s12, v1, 52 -; GCN-NEXT: v_readlane_b32 s13, v1, 53 -; GCN-NEXT: v_readlane_b32 s14, v1, 54 -; GCN-NEXT: v_readlane_b32 s15, v1, 55 -; GCN-NEXT: v_readlane_b32 s16, v1, 56 -; GCN-NEXT: v_readlane_b32 s17, v1, 57 -; GCN-NEXT: v_readlane_b32 s18, v1, 58 -; GCN-NEXT: v_readlane_b32 s19, v1, 59 -; GCN-NEXT: v_readlane_b32 s20, v1, 60 -; GCN-NEXT: v_readlane_b32 s21, v1, 61 -; GCN-NEXT: v_readlane_b32 s22, v1, 62 -; GCN-NEXT: v_readlane_b32 s23, v1, 63 +; GCN-NEXT: v_readlane_b32 s8, v23, 48 +; GCN-NEXT: v_readlane_b32 s9, v23, 49 +; GCN-NEXT: v_readlane_b32 s10, v23, 50 +; GCN-NEXT: v_readlane_b32 s11, v23, 51 +; GCN-NEXT: v_readlane_b32 s12, v23, 52 +; GCN-NEXT: v_readlane_b32 s13, v23, 53 +; GCN-NEXT: v_readlane_b32 s14, v23, 54 +; GCN-NEXT: v_readlane_b32 s15, v23, 55 +; GCN-NEXT: v_readlane_b32 s16, v23, 56 +; GCN-NEXT: v_readlane_b32 s17, v23, 57 +; GCN-NEXT: v_readlane_b32 s18, v23, 58 +; GCN-NEXT: v_readlane_b32 s19, v23, 59 +; GCN-NEXT: v_readlane_b32 s20, v23, 60 +; GCN-NEXT: v_readlane_b32 s21, v23, 61 +; GCN-NEXT: v_readlane_b32 s22, v23, 62 +; GCN-NEXT: v_readlane_b32 s23, v23, 63 +; GCN-NEXT: s_mov_b64 s[6:7], exec +; GCN-NEXT: s_mov_b64 exec, 3 +; GCN-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: v_readlane_b32 s4, v0, 0 ; GCN-NEXT: v_readlane_b32 s5, v0, 1 +; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 +; GCN-NEXT: s_waitcnt vmcnt(0) +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s[8:23] ; GCN-NEXT: ;;#ASMEND @@ -211,14 +203,6 @@ ; GCN-NEXT: ; use s[4:5] ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB0_2: ; %ret -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[24:25] -; GCN-NEXT: s_or_saveexec_b64 s[24:25], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[24:25] -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm call void asm sideeffect "", "~{v[0:7]}" () #0 call void asm sideeffect "", "~{v[8:15]}" () #0 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-partially-undef.mir @@ -20,11 +20,10 @@ liveins: $sgpr4 ; CHECK-LABEL: name: sgpr_spill_s64_undef_high32 - ; CHECK: liveins: $sgpr4 + ; CHECK: liveins: $sgpr4, $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 - ; CHECK-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5 SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) ... @@ -46,11 +45,10 @@ liveins: $sgpr5 ; CHECK-LABEL: name: sgpr_spill_s64_undef_low32 - ; CHECK: liveins: $sgpr5 + ; CHECK: liveins: $sgpr5, $vgpr0 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; CHECK-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 - ; CHECK-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]], implicit $sgpr4_sgpr5 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5, implicit $sgpr4_sgpr5 + ; CHECK-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0, implicit $sgpr4_sgpr5 SI_SPILL_S64_SAVE renamable $sgpr4_sgpr5, %stack.0, implicit $exec, implicit $sgpr96_sgpr97_sgpr98_sgpr99, implicit $sgpr32 :: (store (s64) into %stack.0, align 4, addrspace 5) ... diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-update-only-slot-indexes.ll @@ -13,40 +13,32 @@ ; GCN-NEXT: s_mov_b32 s37, SCRATCH_RSRC_DWORD1 ; GCN-NEXT: s_mov_b32 s38, -1 ; GCN-NEXT: s_mov_b32 s39, 0xe00000 -; GCN-NEXT: ; implicit-def: $vgpr3 +; GCN-NEXT: v_writelane_b32 v40, s4, 0 ; GCN-NEXT: s_add_u32 s36, s36, s11 -; GCN-NEXT: v_writelane_b32 v3, s4, 0 -; GCN-NEXT: s_movk_i32 s32, 0x400 +; GCN-NEXT: v_writelane_b32 v40, s5, 1 ; GCN-NEXT: s_addc_u32 s37, s37, 0 -; GCN-NEXT: s_mov_b32 s14, s10 +; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] +; GCN-NEXT: v_readlane_b32 s0, v40, 0 ; GCN-NEXT: s_mov_b32 s13, s9 ; GCN-NEXT: s_mov_b32 s12, s8 -; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] -; GCN-NEXT: v_writelane_b32 v3, s5, 1 -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_store_dword v3, off, s[36:39], 0 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: s_mov_b64 s[4:5], s[0:1] -; GCN-NEXT: v_readlane_b32 s0, v3, 0 -; GCN-NEXT: v_readlane_b32 s1, v3, 1 +; GCN-NEXT: v_readlane_b32 s1, v40, 1 ; GCN-NEXT: s_add_u32 s8, s0, 36 ; GCN-NEXT: s_addc_u32 s9, s1, 0 ; GCN-NEXT: s_getpc_b64 s[0:1] ; GCN-NEXT: s_add_u32 s0, s0, foo@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s1, s1, foo@gotpcrel32@hi+12 ; GCN-NEXT: s_load_dwordx2 s[16:17], s[0:1], 0x0 +; GCN-NEXT: s_mov_b32 s14, s10 +; GCN-NEXT: s_mov_b64 s[10:11], s[6:7] ; GCN-NEXT: s_mov_b64 s[6:7], s[2:3] ; GCN-NEXT: v_lshlrev_b32_e32 v2, 20, v2 ; GCN-NEXT: v_lshlrev_b32_e32 v1, 10, v1 ; GCN-NEXT: s_mov_b64 s[0:1], s[36:37] ; GCN-NEXT: v_or3_b32 v31, v0, v1, v2 ; GCN-NEXT: s_mov_b64 s[2:3], s[38:39] +; GCN-NEXT: s_mov_b32 s32, 0 ; GCN-NEXT: s_waitcnt lgkmcnt(0) ; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[36:39], 0 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[34:35] -; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: s_endpgm call void @foo() ret void diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir b/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir --- a/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spill-vmem-large-frame.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=false -verify-machineinstrs -start-before=si-lower-sgpr-spills -stop-after=prologepilog -o - %s | FileCheck %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -amdgpu-spill-sgpr-to-vgpr=false -verify-machineinstrs -run-pass=si-lower-sgpr-spills,prologepilog -o - %s | FileCheck %s # Check that we allocate 2 emergency stack slots if we're spilling # SGPRs to memory and potentially have an offset larger than fits in @@ -29,7 +29,7 @@ ; CHECK-NEXT: $sgpr4_sgpr5 = S_MOV_B64 $exec ; CHECK-NEXT: $exec = S_MOV_B64 1, implicit-def $vgpr1 ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.2, addrspace 5) - ; CHECK-NEXT: $vgpr1 = V_WRITELANE_B32 $sgpr10, 0, undef $vgpr1 + ; CHECK-NEXT: $vgpr1 = V_WRITELANE_B32 killed $sgpr10, 0, undef $vgpr1 ; CHECK-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.0, addrspace 5) ; CHECK-NEXT: $vgpr1 = BUFFER_LOAD_DWORD_OFFSET $sgpr0_sgpr1_sgpr2_sgpr3, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.2, addrspace 5) ; CHECK-NEXT: $exec = S_MOV_B64 killed $sgpr4_sgpr5, implicit killed $vgpr1 diff --git a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll --- a/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll +++ b/llvm/test/CodeGen/AMDGPU/sgpr-spills-split-regalloc.ll @@ -16,10 +16,10 @@ ; GCN-LABEL: spill_sgpr_with_no_lower_vgpr_available: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s24, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v255, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Spill @@ -135,13 +135,13 @@ ; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_writelane_b32 v255, s30, 0 ; GCN-NEXT: v_writelane_b32 v255, s31, 1 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:444 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, child_function@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, child_function@gotpcrel32@hi+12 @@ -266,10 +266,10 @@ ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v255, off, s[0:3], s33 offset:452 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s24 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) @@ -310,10 +310,10 @@ ; GCN-LABEL: spill_to_lowest_available_vgpr: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s24, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[16:17], -1 -; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[16:17] ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Spill @@ -428,13 +428,13 @@ ; GCN-NEXT: buffer_store_dword v253, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_writelane_b32 v254, s30, 0 ; GCN-NEXT: v_writelane_b32 v254, s31, 1 -; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v31, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:440 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v31, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_getpc_b64 s[16:17] ; GCN-NEXT: s_add_u32 s16, s16, child_function@gotpcrel32@lo+4 ; GCN-NEXT: s_addc_u32 s17, s17, child_function@gotpcrel32@hi+12 @@ -558,10 +558,10 @@ ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:436 ; 4-byte Folded Reload ; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s33 offset:448 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s24 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] %alloca = alloca i32, align 4, addrspace(5) @@ -602,8 +602,8 @@ ; GCN-LABEL: spill_sgpr_with_sgpr_uses: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_store_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:436 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:432 ; 4-byte Folded Spill @@ -715,38 +715,22 @@ ; GCN-NEXT: buffer_store_dword v251, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v252, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v253, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: ; implicit-def: $vgpr0 ; GCN-NEXT: v_mov_b32_e32 v0, 0 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:440 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[8:9] ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; def s4 ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_writelane_b32 v0, s4, 0 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[8:9] +; GCN-NEXT: v_writelane_b32 v254, s4, 0 ; GCN-NEXT: s_cbranch_scc1 .LBB3_2 ; GCN-NEXT: ; %bb.1: ; %bb0 -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s4, v0, 0 +; GCN-NEXT: v_readlane_b32 s4, v254, 0 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ; use s4 ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: .LBB3_2: ; %ret -; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: ; kill: killed $vgpr0 ; GCN-NEXT: buffer_load_dword v253, off, s[0:3], s32 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v252, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v251, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload @@ -857,8 +841,8 @@ ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:428 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:432 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:436 ; 4-byte Folded Reload -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: s_or_saveexec_b64 s[4:5], -1 +; GCN-NEXT: buffer_load_dword v254, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1183,7 +1167,7 @@ ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Spill @@ -1315,17 +1299,17 @@ ; GCN-NEXT: ; implicit-def: $sgpr4_sgpr5 ; GCN-NEXT: flat_load_dwordx4 v[5:8], v[2:3] ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Spill ; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:464 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Spill ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:452 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:456 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:460 ; 4-byte Folded Reload ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: s_waitcnt vmcnt(0) @@ -1447,7 +1431,7 @@ ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:448 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:464 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[4:5] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] @@ -1528,7 +1512,7 @@ ; GCN-LABEL: spill_sgpr_no_free_vgpr_ipra: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s18, s33 +; GCN-NEXT: s_mov_b32 s24, s33 ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_add_i32 s32, s32, 0x7400 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Spill @@ -1798,7 +1782,7 @@ ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:440 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:444 ; 4-byte Folded Reload ; GCN-NEXT: s_add_i32 s32, s32, 0xffff8c00 -; GCN-NEXT: s_mov_b32 s33, s18 +; GCN-NEXT: s_mov_b32 s33, s24 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] call void @child_function_ipra() diff --git a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll --- a/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll +++ b/llvm/test/CodeGen/AMDGPU/si-spill-sgpr-stack.ll @@ -6,17 +6,17 @@ ; ALL: s_mov_b32 s[[HI:[0-9]+]], 0xe80000 ; Make sure we are handling hazards correctly. -; SGPR: v_mov_b32_e32 v0, vcc_lo -; SGPR-NEXT: s_or_saveexec_b64 [[EXEC_COPY:s\[[0-9]+:[0-9]+\]]], -1 -; SGPR-NEXT: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; 4-byte Folded Reload -; SGPR-NEXT: s_mov_b64 exec, [[EXEC_COPY]] +; SGPR: buffer_load_dword [[VHI:v[0-9]+]], off, s[{{[0-9]+:[0-9]+}}], 0 offset:4 ; SGPR-NEXT: s_waitcnt vmcnt(0) ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 0 ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 1 ; SGPR-NEXT: v_readlane_b32 s{{[0-9]+}}, [[VHI]], 2 ; SGPR-NEXT: v_readlane_b32 s[[HI:[0-9]+]], [[VHI]], 3 -; SGPR-NEXT: ; kill: killed $vgpr1 -; SGPR-NEXT: s_nop 4 +; SGPR-NEXT: buffer_load_dword [[VHI]], off, s[96:99], 0 +; SGPR-NEXT: s_waitcnt vmcnt(0) +; SGPR-NEXT: s_mov_b64 exec, s[4:5] +; SGPR-NEXT: s_nop 1 +; SGPR-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; ALL: s_endpgm define amdgpu_kernel void @test(ptr addrspace(1) %out, i32 %in) { diff --git a/llvm/test/CodeGen/AMDGPU/sibling-call.ll b/llvm/test/CodeGen/AMDGPU/sibling-call.ll --- a/llvm/test/CodeGen/AMDGPU/sibling-call.ll +++ b/llvm/test/CodeGen/AMDGPU/sibling-call.ll @@ -202,9 +202,10 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s{{\[[0-9]+:[0-9]+\]}}, -1 ; GCN-NEXT: buffer_store_dword [[CSRV:v[0-9]+]], off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword [[CSRV_1:v[0-9]+]], off, s[0:3], s33 offset:12 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN-DAG: s_addk_i32 s32, 0x400 -; GCN: v_writelane_b32 [[CSRV]], [[FP_SCRATCH_COPY]], 2 +; GCN-DAG: s_addk_i32 s32, 0x800 +; GCN: v_writelane_b32 [[CSRV_1]], [[FP_SCRATCH_COPY]], 0 ; GCN-DAG: s_getpc_b64 s[4:5] ; GCN-DAG: s_add_u32 s4, s4, i32_fastcc_i32_i32@gotpcrel32@lo+4 @@ -226,11 +227,12 @@ ; GCN-NEXT: s_addc_u32 s5, s5, sibling_call_i32_fastcc_i32_i32@rel32@hi+12 ; GCN-NEXT: v_readlane_b32 s31, [[CSRV]], 1 ; GCN-NEXT: v_readlane_b32 s30, [[CSRV]], 0 -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSRV]], 2 +; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[CSRV_1]], 0 ; GCN-NEXT: s_or_saveexec_b64 s[8:9], -1 ; GCN-NEXT: buffer_load_dword [[CSRV]], off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword [[CSRV_1]], off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[8:9] -; GCN-NEXT: s_addk_i32 s32, 0xfc00 +; GCN-NEXT: s_addk_i32 s32, 0xf800 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN-NEXT: s_setpc_b64 s[4:5] define fastcc i32 @sibling_call_i32_fastcc_i32_i32_other_call(i32 %a, i32 %b, i32 %c) #1 { diff --git a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir --- a/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir +++ b/llvm/test/CodeGen/AMDGPU/snippet-copy-bundle-regression.mir @@ -33,7 +33,6 @@ ; CHECK-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) ; CHECK-NEXT: liveins: $sgpr14, $sgpr15, $sgpr16, $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr1 = IMPLICIT_DEF ; CHECK-NEXT: renamable $sgpr34_sgpr35 = IMPLICIT_DEF ; CHECK-NEXT: dead renamable $vgpr0 = IMPLICIT_DEF ; CHECK-NEXT: renamable $sgpr41 = IMPLICIT_DEF @@ -42,15 +41,7 @@ ; CHECK-NEXT: renamable $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 0, 0 :: (dereferenceable invariant load (s256), align 16, addrspace 4) ; CHECK-NEXT: dead renamable $sgpr4 = S_LOAD_DWORD_IMM renamable $sgpr38_sgpr39, 48, 0 :: (dereferenceable invariant load (s32), align 16, addrspace 4) ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = S_LOAD_DWORDX8_IMM renamable $sgpr38_sgpr39, 56, 0 :: (dereferenceable invariant load (s256), align 8, addrspace 4) - ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr4, 0, killed $vgpr1, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 - ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr5, 1, killed $vgpr1 - ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr6, 2, killed $vgpr1 - ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr7, 3, killed $vgpr1 - ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr8, 4, killed $vgpr1 - ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr9, 5, killed $vgpr1 - ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 $sgpr10, 6, killed $vgpr1 - ; CHECK-NEXT: renamable $vgpr1 = V_WRITELANE_B32 killed $sgpr11, 7, killed $vgpr1, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 - ; CHECK-NEXT: SI_SPILL_WWM_V32_SAVE killed $vgpr1, %stack.1, $sgpr32, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) + ; CHECK-NEXT: SI_SPILL_S256_SAVE killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11, %stack.0, implicit $exec, implicit $sgpr32 :: (store (s256) into %stack.0, align 4, addrspace 5) ; CHECK-NEXT: dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM renamable $sgpr44_sgpr45, 0, 0 :: (invariant load (s64), align 16, addrspace 4) ; CHECK-NEXT: ADJCALLSTACKUP 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 ; CHECK-NEXT: $vgpr1 = COPY renamable $sgpr51 @@ -63,50 +54,30 @@ ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr1, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 - ; CHECK-NEXT: $sgpr5 = V_READLANE_B32 $vgpr1, 1 - ; CHECK-NEXT: $sgpr6 = V_READLANE_B32 $vgpr1, 2 - ; CHECK-NEXT: $sgpr7 = V_READLANE_B32 $vgpr1, 3 - ; CHECK-NEXT: $sgpr8 = V_READLANE_B32 $vgpr1, 4 - ; CHECK-NEXT: $sgpr9 = V_READLANE_B32 $vgpr1, 5 - ; CHECK-NEXT: $sgpr10 = V_READLANE_B32 $vgpr1, 6 - ; CHECK-NEXT: $sgpr11 = V_READLANE_B32 $vgpr1, 7 - ; CHECK-NEXT: $noreg = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $exec = S_MOV_B64 killed $noreg + ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: S_BRANCH %bb.3 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.2: ; CHECK-NEXT: successors: %bb.3(0x80000000) ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: renamable $vgpr1 = SI_SPILL_WWM_V32_RESTORE %stack.1, $sgpr32, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) - ; CHECK-NEXT: $sgpr4 = V_READLANE_B32 $vgpr1, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 - ; CHECK-NEXT: $sgpr5 = V_READLANE_B32 $vgpr1, 1 - ; CHECK-NEXT: $sgpr6 = V_READLANE_B32 $vgpr1, 2 - ; CHECK-NEXT: $sgpr7 = V_READLANE_B32 $vgpr1, 3 - ; CHECK-NEXT: $sgpr8 = V_READLANE_B32 $vgpr1, 4 - ; CHECK-NEXT: $sgpr9 = V_READLANE_B32 $vgpr1, 5 - ; CHECK-NEXT: $sgpr10 = V_READLANE_B32 $vgpr1, 6 - ; CHECK-NEXT: $sgpr11 = V_READLANE_B32 $vgpr1, 7 + ; CHECK-NEXT: renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11 = SI_SPILL_S256_RESTORE %stack.0, implicit $exec, implicit $sgpr32 :: (load (s256) from %stack.0, align 4, addrspace 5) ; CHECK-NEXT: S_CMP_LG_U64 renamable $sgpr4_sgpr5, 0, implicit-def $scc - ; CHECK-NEXT: $noreg = S_OR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec - ; CHECK-NEXT: $exec = S_MOV_B64 killed $noreg ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.3: ; CHECK-NEXT: successors: %bb.5(0x40000000), %bb.4(0x40000000) - ; CHECK-NEXT: liveins: $vgpr1, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CBRANCH_VCCZ %bb.5, implicit undef $vcc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.4: ; CHECK-NEXT: successors: %bb.5(0x80000000) - ; CHECK-NEXT: liveins: $vgpr1, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000003F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: S_CMP_EQ_U32 renamable $sgpr8, 0, implicit-def $scc ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: bb.5: - ; CHECK-NEXT: liveins: $vgpr1, $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000000F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 + ; CHECK-NEXT: liveins: $sgpr34_sgpr35, $sgpr36_sgpr37, $sgpr38_sgpr39, $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11:0x00000000000000F0, $sgpr44_sgpr45_sgpr46_sgpr47_sgpr48_sgpr49_sgpr50_sgpr51:0x000000000000FC00 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: dead renamable $sgpr4_sgpr5 = S_LOAD_DWORDX2_IMM killed renamable $sgpr38_sgpr39, 40, 0 :: (dereferenceable invariant load (s64), addrspace 4) ; CHECK-NEXT: GLOBAL_STORE_DWORD_SADDR undef renamable $vgpr0, undef renamable $vgpr0, killed renamable $sgpr6_sgpr7, 0, 0, implicit $exec :: (store (s32), addrspace 1) @@ -116,7 +87,6 @@ ; CHECK-NEXT: $sgpr6_sgpr7 = COPY killed renamable $sgpr36_sgpr37 ; CHECK-NEXT: $sgpr10_sgpr11 = COPY killed renamable $sgpr34_sgpr35 ; CHECK-NEXT: ADJCALLSTACKDOWN 0, 0, implicit-def dead $scc, implicit-def $sgpr32, implicit $sgpr32 - ; CHECK-NEXT: KILL killed renamable $vgpr1 ; CHECK-NEXT: S_ENDPGM 0 bb.0: liveins: $vgpr0, $vgpr1, $vgpr2, $sgpr4_sgpr5, $sgpr6_sgpr7, $sgpr8_sgpr9, $sgpr10_sgpr11, $sgpr14, $sgpr15, $sgpr16 diff --git a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll --- a/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-csr-frame-ptr-reg-copy.ll @@ -2,22 +2,20 @@ ; GCN-LABEL: {{^}}spill_csr_s5_copy: ; GCN: s_mov_b32 [[FP_SCRATCH_COPY:s[0-9]+]], s33 -; GCN: s_xor_saveexec_b64 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN: s_or_saveexec_b64 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec -; GCN: v_writelane_b32 v40, [[FP_SCRATCH_COPY]], 4 +; GCN: v_writelane_b32 v41, [[FP_SCRATCH_COPY]], 0 ; GCN: s_swappc_b64 ; GCN: v_mov_b32_e32 [[K:v[0-9]+]], 9 ; GCN: buffer_store_dword [[K]], off, s[0:3], s33{{$}} -; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v40, 4 -; GCN: s_xor_saveexec_b64 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, -1 +; GCN: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], v41, 0 +; GCN: s_or_saveexec_b64 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN: s_mov_b64 exec ; GCN: s_mov_b32 s33, [[FP_SCRATCH_COPY]] ; GCN: s_setpc_b64 diff --git a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir --- a/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir +++ b/llvm/test/CodeGen/AMDGPU/spill-reg-tuple-super-reg-use.mir @@ -1,5 +1,5 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -march=amdgcn -mcpu=gfx900 -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s +# RUN: llc -march=amdgcn -mcpu=gfx900 -run-pass=si-lower-sgpr-spills,prologepilog,machine-cp -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s # Make sure the initial first $sgpr1 = COPY $sgpr2 copy is not deleted # by the copy propagation after lowering the spill. @@ -25,14 +25,12 @@ ; GCN-NEXT: $sgpr8_sgpr9 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr8_sgpr9 - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: renamable $sgpr1 = COPY $sgpr2 - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, killed $vgpr0 - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, killed $vgpr0 - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: renamable $sgpr8 = COPY renamable $sgpr1 - ; GCN-NEXT: KILL killed renamable $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: renamable $sgpr8 = COPY killed renamable $sgpr1 ; GCN-NEXT: $sgpr0_sgpr1 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 @@ -64,13 +62,11 @@ ; GCN-NEXT: $sgpr8_sgpr9 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (store (s32) into %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr8_sgpr9 - ; GCN-NEXT: renamable $vgpr0 = IMPLICIT_DEF ; GCN-NEXT: renamable $sgpr1 = COPY $sgpr2 - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, killed $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, killed $vgpr0 - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, killed $vgpr0 - ; GCN-NEXT: renamable $vgpr0 = V_WRITELANE_B32 $sgpr3, 3, killed $vgpr0, implicit $sgpr0_sgpr1_sgpr2_sgpr3 - ; GCN-NEXT: KILL killed renamable $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr0, 0, $vgpr0, implicit-def $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr0_sgpr1_sgpr2_sgpr3 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr1, 1, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr2, 2, $vgpr0 + ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr3, 3, $vgpr0, implicit killed $sgpr0_sgpr1_sgpr2_sgpr3 ; GCN-NEXT: $sgpr0_sgpr1 = S_XOR_SAVEEXEC_B64 -1, implicit-def $exec, implicit-def dead $scc, implicit $exec ; GCN-NEXT: $vgpr0 = BUFFER_LOAD_DWORD_OFFSET $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec :: (load (s32) from %stack.1, addrspace 5) ; GCN-NEXT: $exec = S_MOV_B64 killed $sgpr0_sgpr1 @@ -97,12 +93,12 @@ ; GCN-LABEL: name: spill_vgpr128_use_subreg ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2, implicit $exec + ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2 ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 4, addrspace 5) ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 8, addrspace 5) ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5) - ; GCN-NEXT: renamable $vgpr8 = COPY $vgpr2, implicit $exec + ; GCN-NEXT: renamable $vgpr8 = COPY killed renamable $vgpr1 ; GCN-NEXT: S_ENDPGM 0, implicit $vgpr8 renamable $vgpr1 = COPY $vgpr2 SI_SPILL_V128_SAVE renamable $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) @@ -127,11 +123,11 @@ ; GCN-LABEL: name: spill_vgpr128_use_kill ; GCN: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3, $vgpr4, $vgpr5, $vgpr6, $vgpr7 ; GCN-NEXT: {{ $}} - ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2, implicit $exec - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 4, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 8, addrspace 5) - ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, implicit $exec, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5) + ; GCN-NEXT: renamable $vgpr1 = COPY $vgpr2 + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr0, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 0, 0, 0, implicit $exec, implicit-def $vgpr0_vgpr1_vgpr2_vgpr3, implicit $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr1, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 4, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 4, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr2, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 8, 0, 0, implicit $exec :: (store (s32) into %stack.0 + 8, addrspace 5) + ; GCN-NEXT: BUFFER_STORE_DWORD_OFFSET killed $vgpr3, $sgpr100_sgpr101_sgpr102_sgpr103, $sgpr32, 12, 0, 0, implicit $exec, implicit killed $vgpr0_vgpr1_vgpr2_vgpr3 :: (store (s32) into %stack.0 + 12, addrspace 5) ; GCN-NEXT: S_ENDPGM 0 renamable $vgpr1 = COPY $vgpr2 SI_SPILL_V128_SAVE renamable killed $vgpr0_vgpr1_vgpr2_vgpr3, %stack.0, $sgpr32, 0, implicit $exec :: (store (s128) into %stack.0, align 4, addrspace 5) diff --git a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir b/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-sgpr-to-virtual-vgpr.mir +++ /dev/null @@ -1,319 +0,0 @@ -# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -run-pass=si-lower-sgpr-spills -verify-machineinstrs %s -o - | FileCheck -check-prefix=GCN %s - -# A simple SGPR spill. Implicit def for lane VGPR should be inserted just before the spill instruction. ---- -name: sgpr32_spill -tracksRegLiveness: true -frameInfo: - maxAlignment: 4 -stack: - - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } -machineFunctionInfo: - isEntryFunction: false - scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - stackPtrOffsetReg: '$sgpr32' - frameOffsetReg: '$sgpr33' - hasSpilledSGPRs: true -body: | - bb.0: - liveins: $sgpr30_sgpr31, $sgpr10 - ; GCN-LABEL: name: sgpr32_spill - ; GCN: liveins: $sgpr30_sgpr31, $sgpr10 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] - ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_]], 0 - ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31 - S_NOP 0 - SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - S_SETPC_B64 $sgpr30_sgpr31 -... - -# Needed an additional virtual lane register as the lanes of current register are fully occupied while spilling a wide SGPR tuple. -# There must be two implicit def for the two lane VGPRs. - ---- -name: sgpr_spill_lane_crossover -tracksRegLiveness: true -frameInfo: - maxAlignment: 4 -stack: - - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } - - { id: 1, type: spill-slot, size: 128, alignment: 4, stack-id: sgpr-spill } -machineFunctionInfo: - isEntryFunction: false - scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - stackPtrOffsetReg: '$sgpr32' - frameOffsetReg: '$sgpr33' - hasSpilledSGPRs: true -body: | - bb.0: - liveins: $sgpr30_sgpr31, $sgpr10, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-LABEL: name: sgpr_spill_lane_crossover - ; GCN: liveins: $sgpr10, $sgpr64, $sgpr65, $sgpr66, $sgpr67, $sgpr68, $sgpr69, $sgpr70, $sgpr71, $sgpr72, $sgpr73, $sgpr74, $sgpr75, $sgpr76, $sgpr77, $sgpr78, $sgpr79, $sgpr80, $sgpr81, $sgpr82, $sgpr83, $sgpr84, $sgpr85, $sgpr86, $sgpr87, $sgpr88, $sgpr89, $sgpr90, $sgpr91, $sgpr92, $sgpr93, $sgpr94, $sgpr95, $vgpr0, $sgpr30_sgpr31, $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71, $sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79, $sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87, $sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr64, 0, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr65, 1, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr66, 2, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr67, 3, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr68, 4, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr69, 5, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr70, 6, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr71, 7, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr72, 8, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr73, 9, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr74, 10, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr75, 11, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr76, 12, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr77, 13, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr78, 14, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr79, 15, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr80, 16, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr81, 17, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr82, 18, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr83, 19, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr84, 20, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr85, 21, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr86, 22, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr87, 23, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr88, 24, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr89, 25, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr90, 26, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr91, 27, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr92, 28, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr93, 29, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr94, 30, $vgpr0 - ; GCN-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr95, 31, $vgpr0 - ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr64, 1, [[V_WRITELANE_B32_1]], implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, implicit $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr65, 2, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr66, 3, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr67, 4, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr68, 5, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr69, 6, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr70, 7, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr71, 8, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr72, 9, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr73, 10, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr74, 11, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr75, 12, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr76, 13, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr77, 14, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr78, 15, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr79, 16, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr80, 17, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr81, 18, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr82, 19, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr83, 20, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr84, 21, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr85, 22, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr86, 23, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr87, 24, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr88, 25, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr89, 26, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr90, 27, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr91, 28, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr92, 29, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr93, 30, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr94, 31, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr95, 32, [[V_WRITELANE_B32_1]], implicit killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: $sgpr64 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1, implicit-def $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 - ; GCN-NEXT: $sgpr65 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 - ; GCN-NEXT: $sgpr66 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 - ; GCN-NEXT: $sgpr67 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 - ; GCN-NEXT: $sgpr68 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 - ; GCN-NEXT: $sgpr69 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 - ; GCN-NEXT: $sgpr70 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7 - ; GCN-NEXT: $sgpr71 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8 - ; GCN-NEXT: $sgpr72 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 9 - ; GCN-NEXT: $sgpr73 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 10 - ; GCN-NEXT: $sgpr74 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 11 - ; GCN-NEXT: $sgpr75 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 12 - ; GCN-NEXT: $sgpr76 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 13 - ; GCN-NEXT: $sgpr77 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 14 - ; GCN-NEXT: $sgpr78 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 15 - ; GCN-NEXT: $sgpr79 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 16 - ; GCN-NEXT: $sgpr80 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 17 - ; GCN-NEXT: $sgpr81 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 18 - ; GCN-NEXT: $sgpr82 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 19 - ; GCN-NEXT: $sgpr83 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 20 - ; GCN-NEXT: $sgpr84 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 21 - ; GCN-NEXT: $sgpr85 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 22 - ; GCN-NEXT: $sgpr86 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 23 - ; GCN-NEXT: $sgpr87 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 24 - ; GCN-NEXT: $sgpr88 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 25 - ; GCN-NEXT: $sgpr89 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 26 - ; GCN-NEXT: $sgpr90 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 27 - ; GCN-NEXT: $sgpr91 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 28 - ; GCN-NEXT: $sgpr92 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 29 - ; GCN-NEXT: $sgpr93 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 30 - ; GCN-NEXT: $sgpr94 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 31 - ; GCN-NEXT: $sgpr95 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 32 - ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0 - ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31 - S_NOP 0 - SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - SI_SPILL_S1024_SAVE killed $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95, %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - S_NOP 0 - renamable $sgpr64_sgpr65_sgpr66_sgpr67_sgpr68_sgpr69_sgpr70_sgpr71_sgpr72_sgpr73_sgpr74_sgpr75_sgpr76_sgpr77_sgpr78_sgpr79_sgpr80_sgpr81_sgpr82_sgpr83_sgpr84_sgpr85_sgpr86_sgpr87_sgpr88_sgpr89_sgpr90_sgpr91_sgpr92_sgpr93_sgpr94_sgpr95 = SI_SPILL_S1024_RESTORE %stack.1, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - S_SETPC_B64 $sgpr30_sgpr31 -... - -# The implicit def for the lane VGPR should be inserted at the common dominator block (the entry block here). - ---- -name: lane_vgpr_implicit_def_at_common_dominator_block -tracksRegLiveness: true -frameInfo: - maxAlignment: 4 -stack: - - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } -machineFunctionInfo: - isEntryFunction: false - scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - stackPtrOffsetReg: '$sgpr32' - frameOffsetReg: '$sgpr33' - hasSpilledSGPRs: true -body: | - ; GCN-LABEL: name: lane_vgpr_implicit_def_at_common_dominator_block - ; GCN: bb.0: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc - ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr10 = S_MOV_B32 10 - ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] - ; GCN-NEXT: S_BRANCH %bb.3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.2: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr10 = S_MOV_B32 20 - ; GCN-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_1]] - ; GCN-NEXT: S_BRANCH %bb.3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.3: - ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0 - ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 - bb.0: - liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 - S_NOP 0 - S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc - S_CBRANCH_SCC1 %bb.2, implicit killed $scc - bb.1: - liveins: $sgpr10, $sgpr30_sgpr31 - $sgpr10 = S_MOV_B32 10 - SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - S_BRANCH %bb.3 - bb.2: - liveins: $sgpr10, $sgpr30_sgpr31 - $sgpr10 = S_MOV_B32 20 - SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - S_BRANCH %bb.3 - bb.3: - liveins: $sgpr10, $sgpr30_sgpr31 - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 -... - -# The common dominator block is visited only at the end. The insertion point was initially identified to the -# terminator instruction in the dominator block which later becomes the point where a spill get inserted in the same block. - ---- -name: dominator_block_follows_the_successors_bbs -tracksRegLiveness: true -frameInfo: - maxAlignment: 4 -stack: - - { id: 0, type: spill-slot, size: 4, alignment: 4, stack-id: sgpr-spill } -machineFunctionInfo: - isEntryFunction: false - scratchRSrcReg: '$sgpr0_sgpr1_sgpr2_sgpr3' - stackPtrOffsetReg: '$sgpr32' - frameOffsetReg: '$sgpr33' - hasSpilledSGPRs: true -body: | - ; GCN-LABEL: name: dominator_block_follows_the_successors_bbs - ; GCN: bb.0: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: S_BRANCH %bb.3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.1: - ; GCN-NEXT: successors: %bb.2(0x80000000) - ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[DEF]], 0 - ; GCN-NEXT: $sgpr10 = S_ADD_I32 $sgpr10, 15, implicit-def dead $scc - ; GCN-NEXT: S_BRANCH %bb.2 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.2: - ; GCN-NEXT: successors: %bb.3(0x80000000) - ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr10 = V_READLANE_B32 [[DEF]], 0 - ; GCN-NEXT: $sgpr10 = S_ADD_I32 $sgpr10, 20, implicit-def dead $scc - ; GCN-NEXT: S_BRANCH %bb.3 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.3: - ; GCN-NEXT: successors: %bb.2(0x40000000), %bb.1(0x40000000) - ; GCN-NEXT: liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: $sgpr10 = S_MOV_B32 10 - ; GCN-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 0, [[V_WRITELANE_B32_]] - ; GCN-NEXT: S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc - ; GCN-NEXT: S_CBRANCH_SCC1 %bb.2, implicit killed $scc - ; GCN-NEXT: S_BRANCH %bb.1 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: bb.4: - ; GCN-NEXT: liveins: $sgpr10, $sgpr30_sgpr31 - ; GCN-NEXT: {{ $}} - ; GCN-NEXT: S_NOP 0 - ; GCN-NEXT: S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 - bb.0: - liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 - S_NOP 0 - S_BRANCH %bb.3 - bb.1: - liveins: $sgpr10, $sgpr30_sgpr31 - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - $sgpr10 = S_ADD_I32 $sgpr10, 15, implicit-def dead $scc - S_BRANCH %bb.2 - bb.2: - liveins: $sgpr10, $sgpr30_sgpr31 - renamable $sgpr10 = SI_SPILL_S32_RESTORE %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - $sgpr10 = S_ADD_I32 $sgpr10, 20, implicit-def dead $scc - S_BRANCH %bb.3 - bb.3: - liveins: $sgpr10, $sgpr11, $sgpr30_sgpr31 - $sgpr10 = S_MOV_B32 10 - SI_SPILL_S32_SAVE killed $sgpr10, %stack.0, implicit $exec, implicit $sgpr0_sgpr1_sgpr2_sgpr3, implicit $sgpr32 - S_CMP_EQ_U32 $sgpr11, 0, implicit-def $scc - S_CBRANCH_SCC1 %bb.2, implicit killed $scc - S_BRANCH %bb.1 - bb.4: - liveins: $sgpr10, $sgpr30_sgpr31 - S_NOP 0 - S_SETPC_B64 $sgpr30_sgpr31, implicit $sgpr10 -... diff --git a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll b/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/spill-vgpr-to-agpr-update-regscavenger.ll +++ /dev/null @@ -1,92 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -O0 -verify-machineinstrs -o - %s | FileCheck %s - -; Regression test for `processFunctionBeforeFrameFinalized`: -; Check that it correctly updates RegisterScavenger so we -; don't end up with bad machine code due to using undefined -; physical registers. - -define void @test() { -; CHECK-LABEL: test: -; CHECK: ; %bb.0: ; %bb.0 -; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: ; implicit-def: $vgpr0 -; CHECK-NEXT: .LBB0_1: ; %bb.1 -; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: s_cbranch_scc1 .LBB0_3 -; CHECK-NEXT: ; %bb.2: ; %bb.2 -; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: .LBB0_3: ; %bb.3 -; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] -; CHECK-NEXT: ; implicit-def: $sgpr4 -; CHECK-NEXT: v_mov_b32_e32 v1, s4 -; CHECK-NEXT: v_readfirstlane_b32 s6, v1 -; CHECK-NEXT: s_mov_b64 s[4:5], -1 -; CHECK-NEXT: s_mov_b32 s7, 0 -; CHECK-NEXT: s_cmp_eq_u32 s6, s7 -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 -; CHECK-NEXT: s_mov_b64 s[10:11], exec -; CHECK-NEXT: s_mov_b64 exec, -1 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] -; CHECK-NEXT: s_cbranch_scc1 .LBB0_5 -; CHECK-NEXT: ; %bb.4: ; %bb.4 -; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] -; CHECK-NEXT: s_mov_b64 s[4:5], 0 -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_accvgpr_write_b32 a0, v0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] -; CHECK-NEXT: .LBB0_5: ; %Flow -; CHECK-NEXT: ; in Loop: Header=BB0_1 Depth=1 -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 -; CHECK-NEXT: s_nop 0 -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] -; CHECK-NEXT: v_readlane_b32 s4, v0, 0 -; CHECK-NEXT: v_readlane_b32 s5, v0, 1 -; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[4:5] -; CHECK-NEXT: s_mov_b32 s4, 1 -; CHECK-NEXT: ; implicit-def: $sgpr5 -; CHECK-NEXT: v_cmp_ne_u32_e64 s[4:5], v0, s4 -; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] -; CHECK-NEXT: s_cbranch_vccnz .LBB0_1 -; CHECK-NEXT: ; %bb.6: ; %bb.5 -; CHECK-NEXT: s_or_saveexec_b64 s[10:11], -1 -; CHECK-NEXT: v_accvgpr_read_b32 v0, a0 ; Reload Reuse -; CHECK-NEXT: s_mov_b64 exec, s[10:11] -; CHECK-NEXT: ; kill: killed $vgpr0 -; CHECK-NEXT: s_xor_saveexec_b64 s[4:5], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[4:5] -; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: s_setpc_b64 s[30:31] -bb.0: - br label %bb.1 -bb.1: ; preds = %bb.4, %bb.0 - br i1 poison, label %bb.2, label %bb.3 -bb.2: ; preds = %bb.1 - br label %bb.3 -bb.3: ; preds = %bb.2, %bb.1 - %call = tail call i32 @llvm.amdgcn.readfirstlane(i32 poison) - %cmp = icmp eq i32 %call, 0 - br i1 %cmp, label %bb.5, label %bb.4 -bb.4: ; preds = %bb.3 - br label %bb.1 -bb.5: ; preds = %bb.3 - ret void -} - -declare i32 @llvm.amdgcn.readfirstlane(i32) diff --git a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll --- a/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll +++ b/llvm/test/CodeGen/AMDGPU/spill-writelane-vgprs.ll @@ -10,16 +10,16 @@ ; GCN-LABEL: sgpr_spill_writelane: ; GCN: ; %bb.0: ; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: v_writelane_b32 v0, s35, 0 ; GCN-NEXT: ;;#ASMSTART ; GCN-NEXT: ;;#ASMEND ; GCN-NEXT: v_readlane_b32 s35, v0, 0 -; GCN-NEXT: s_xor_saveexec_b64 s[4:5], -1 +; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[4:5] +; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: s_setpc_b64 s[30:31] call void asm sideeffect "", "~{s35}"() diff --git a/llvm/test/CodeGen/AMDGPU/spill192.mir b/llvm/test/CodeGen/AMDGPU/spill192.mir --- a/llvm/test/CodeGen/AMDGPU/spill192.mir +++ b/llvm/test/CodeGen/AMDGPU/spill192.mir @@ -32,29 +32,32 @@ ; EXPANDED-LABEL: name: spill_restore_sgpr192 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr9, 5, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr9, 5, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 + ; EXPANDED-NEXT: liveins: $vgpr0 + ; EXPANDED-NEXT: {{ $}} + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9 bb.0: S_NOP 0, implicit-def %0:sgpr_192 diff --git a/llvm/test/CodeGen/AMDGPU/spill224.mir b/llvm/test/CodeGen/AMDGPU/spill224.mir --- a/llvm/test/CodeGen/AMDGPU/spill224.mir +++ b/llvm/test/CodeGen/AMDGPU/spill224.mir @@ -30,31 +30,34 @@ ; EXPANDED-LABEL: name: spill_restore_sgpr224 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr10, 6, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr10, 6, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 + ; EXPANDED-NEXT: liveins: $vgpr0 + ; EXPANDED-NEXT: {{ $}} + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10 bb.0: S_NOP 0, implicit-def %0:sgpr_224 diff --git a/llvm/test/CodeGen/AMDGPU/spill288.mir b/llvm/test/CodeGen/AMDGPU/spill288.mir --- a/llvm/test/CodeGen/AMDGPU/spill288.mir +++ b/llvm/test/CodeGen/AMDGPU/spill288.mir @@ -30,35 +30,38 @@ ; EXPANDED-LABEL: name: spill_restore_sgpr288 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr10, 6, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr11, 7, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr12, 8, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr10, 6, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr11, 7, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr12, 8, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 - ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7 - ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8 + ; EXPANDED-NEXT: liveins: $vgpr0 + ; EXPANDED-NEXT: {{ $}} + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 + ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 $vgpr0, 7 + ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 8 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12 bb.0: S_NOP 0, implicit-def %0:sgpr_288 diff --git a/llvm/test/CodeGen/AMDGPU/spill320.mir b/llvm/test/CodeGen/AMDGPU/spill320.mir --- a/llvm/test/CodeGen/AMDGPU/spill320.mir +++ b/llvm/test/CodeGen/AMDGPU/spill320.mir @@ -30,37 +30,40 @@ ; EXPANDED-LABEL: name: spill_restore_sgpr320 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr10, 6, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr11, 7, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr12, 8, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr13, 9, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr10, 6, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr11, 7, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 8, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr13, 9, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 - ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7 - ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8 - ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 9 + ; EXPANDED-NEXT: liveins: $vgpr0 + ; EXPANDED-NEXT: {{ $}} + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 + ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 $vgpr0, 7 + ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 8 + ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 9 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13 bb.0: S_NOP 0, implicit-def %0:sgpr_320 diff --git a/llvm/test/CodeGen/AMDGPU/spill352.mir b/llvm/test/CodeGen/AMDGPU/spill352.mir --- a/llvm/test/CodeGen/AMDGPU/spill352.mir +++ b/llvm/test/CodeGen/AMDGPU/spill352.mir @@ -30,39 +30,42 @@ ; EXPANDED-LABEL: name: spill_restore_sgpr352 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr10, 6, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr11, 7, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr12, 8, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr13, 9, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr14, 10, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr10, 6, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr11, 7, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 8, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 9, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr14, 10, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 - ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7 - ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8 - ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 9 - ; EXPANDED-NEXT: $sgpr14 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 10 + ; EXPANDED-NEXT: liveins: $vgpr0 + ; EXPANDED-NEXT: {{ $}} + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 + ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 $vgpr0, 7 + ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 8 + ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 9 + ; EXPANDED-NEXT: $sgpr14 = V_READLANE_B32 $vgpr0, 10 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14 bb.0: S_NOP 0, implicit-def %0:sgpr_352 diff --git a/llvm/test/CodeGen/AMDGPU/spill384.mir b/llvm/test/CodeGen/AMDGPU/spill384.mir --- a/llvm/test/CodeGen/AMDGPU/spill384.mir +++ b/llvm/test/CodeGen/AMDGPU/spill384.mir @@ -30,41 +30,44 @@ ; EXPANDED-LABEL: name: spill_restore_sgpr384 ; EXPANDED: bb.0: ; EXPANDED-NEXT: successors: %bb.1(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} - ; EXPANDED-NEXT: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF ; EXPANDED-NEXT: S_NOP 0, implicit-def renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr4, 0, [[V_WRITELANE_B32_]], implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr5, 1, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr6, 2, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr7, 3, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr8, 4, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr9, 5, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr10, 6, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr11, 7, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr12, 8, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr13, 9, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 $sgpr14, 10, [[V_WRITELANE_B32_1]] - ; EXPANDED-NEXT: [[V_WRITELANE_B32_1:%[0-9]+]]:vgpr_32 = V_WRITELANE_B32 killed $sgpr15, 11, [[V_WRITELANE_B32_1]], implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr4, 0, $vgpr0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15, implicit $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr5, 1, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr6, 2, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr7, 3, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr8, 4, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr9, 5, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr10, 6, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr11, 7, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr12, 8, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr13, 9, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 $sgpr14, 10, $vgpr0 + ; EXPANDED-NEXT: $vgpr0 = V_WRITELANE_B32 killed $sgpr15, 11, $vgpr0, implicit killed $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 ; EXPANDED-NEXT: S_CBRANCH_SCC1 %bb.1, implicit undef $scc ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.1: ; EXPANDED-NEXT: successors: %bb.2(0x80000000) + ; EXPANDED-NEXT: liveins: $vgpr0 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: S_NOP 1 ; EXPANDED-NEXT: {{ $}} ; EXPANDED-NEXT: bb.2: - ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 - ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 1 - ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 2 - ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 3 - ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 4 - ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 5 - ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 6 - ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 7 - ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 8 - ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 9 - ; EXPANDED-NEXT: $sgpr14 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 10 - ; EXPANDED-NEXT: $sgpr15 = V_READLANE_B32 [[V_WRITELANE_B32_1]], 11 + ; EXPANDED-NEXT: liveins: $vgpr0 + ; EXPANDED-NEXT: {{ $}} + ; EXPANDED-NEXT: $sgpr4 = V_READLANE_B32 $vgpr0, 0, implicit-def $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 + ; EXPANDED-NEXT: $sgpr5 = V_READLANE_B32 $vgpr0, 1 + ; EXPANDED-NEXT: $sgpr6 = V_READLANE_B32 $vgpr0, 2 + ; EXPANDED-NEXT: $sgpr7 = V_READLANE_B32 $vgpr0, 3 + ; EXPANDED-NEXT: $sgpr8 = V_READLANE_B32 $vgpr0, 4 + ; EXPANDED-NEXT: $sgpr9 = V_READLANE_B32 $vgpr0, 5 + ; EXPANDED-NEXT: $sgpr10 = V_READLANE_B32 $vgpr0, 6 + ; EXPANDED-NEXT: $sgpr11 = V_READLANE_B32 $vgpr0, 7 + ; EXPANDED-NEXT: $sgpr12 = V_READLANE_B32 $vgpr0, 8 + ; EXPANDED-NEXT: $sgpr13 = V_READLANE_B32 $vgpr0, 9 + ; EXPANDED-NEXT: $sgpr14 = V_READLANE_B32 $vgpr0, 10 + ; EXPANDED-NEXT: $sgpr15 = V_READLANE_B32 $vgpr0, 11 ; EXPANDED-NEXT: S_NOP 0, implicit killed renamable $sgpr4_sgpr5_sgpr6_sgpr7_sgpr8_sgpr9_sgpr10_sgpr11_sgpr12_sgpr13_sgpr14_sgpr15 bb.0: S_NOP 0, implicit-def %0:sgpr_384 diff --git a/llvm/test/CodeGen/AMDGPU/stack-realign.ll b/llvm/test/CodeGen/AMDGPU/stack-realign.ll --- a/llvm/test/CodeGen/AMDGPU/stack-realign.ll +++ b/llvm/test/CodeGen/AMDGPU/stack-realign.ll @@ -161,24 +161,26 @@ ; GCN-NEXT: s_and_b32 s33, [[SCRATCH_REG]], 0xffff0000 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword [[VGPR_REG:v[0-9]+]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword [[VGPR_REG_1:v[0-9]+]], off, s[0:3], s33 offset:1032 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] ; GCN-NEXT: v_mov_b32_e32 v32, 0 -; GCN-DAG: v_writelane_b32 [[VGPR_REG]], s34, 3 +; GCN-DAG: v_writelane_b32 [[VGPR_REG_1]], s34, 1 ; GCN: s_mov_b32 s34, s32 ; GCN: buffer_store_dword v32, off, s[0:3], s33 offset:1024 ; GCN-NEXT: s_waitcnt vmcnt(0) ; GCN-NEXT: buffer_load_dword v{{[0-9]+}}, off, s[0:3], s34 ; GCN-DAG: s_add_i32 s32, s32, 0x30000 -; GCN: v_writelane_b32 [[VGPR_REG]], [[FP_SCRATCH_COPY]], 2 +; GCN: v_writelane_b32 [[VGPR_REG_1]], [[FP_SCRATCH_COPY]], 0 ; GCN: buffer_store_dword v{{[0-9]+}}, off, s[0:3], s32 ; GCN: s_swappc_b64 s[30:31], ; GCN: v_readlane_b32 s31, [[VGPR_REG]], 1 ; GCN: v_readlane_b32 s30, [[VGPR_REG]], 0 -; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG]], 3 -; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[VGPR_REG]], 2 +; GCN-NEXT: v_readlane_b32 s34, [[VGPR_REG_1]], 1 +; GCN-NEXT: v_readlane_b32 [[FP_SCRATCH_COPY:s[0-9]+]], [[VGPR_REG_1]], 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword [[VGPR_REG]], off, s[0:3], s33 offset:1028 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword [[VGPR_REG_1]], off, s[0:3], s33 offset:1032 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_add_i32 s32, s32, 0xfffd0000 ; GCN-NEXT: s_mov_b32 s33, [[FP_SCRATCH_COPY]] diff --git a/llvm/test/CodeGen/AMDGPU/swdev380865.ll b/llvm/test/CodeGen/AMDGPU/swdev380865.ll --- a/llvm/test/CodeGen/AMDGPU/swdev380865.ll +++ b/llvm/test/CodeGen/AMDGPU/swdev380865.ll @@ -16,96 +16,94 @@ ; CHECK: ; %bb.0: ; %entry ; CHECK-NEXT: s_mov_b64 s[0:1], 0 ; CHECK-NEXT: s_load_dword s2, s[0:1], 0x0 -; CHECK-NEXT: ; implicit-def: $vgpr2 ; CHECK-NEXT: ; kill: killed $sgpr0_sgpr1 ; CHECK-NEXT: s_mov_b32 s7, 0x401c0000 ; CHECK-NEXT: s_mov_b32 s5, 0x40280000 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_writelane_b32 v2, s2, 0 +; CHECK-NEXT: v_writelane_b32 v0, s2, 0 ; CHECK-NEXT: s_load_dwordx2 s[2:3], s[0:1], 0x0 ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, 0x40140000 ; CHECK-NEXT: s_mov_b32 s1, 0x40180000 -; CHECK-NEXT: v_writelane_b32 v2, s0, 1 -; CHECK-NEXT: v_writelane_b32 v2, s1, 2 +; CHECK-NEXT: v_writelane_b32 v0, s0, 1 +; CHECK-NEXT: v_writelane_b32 v0, s1, 2 ; CHECK-NEXT: s_mov_b32 s1, 0x40220000 -; CHECK-NEXT: v_writelane_b32 v2, s0, 3 -; CHECK-NEXT: v_writelane_b32 v2, s1, 4 +; CHECK-NEXT: v_writelane_b32 v0, s0, 3 +; CHECK-NEXT: v_writelane_b32 v0, s1, 4 ; CHECK-NEXT: s_mov_b32 s1, 0x40240000 -; CHECK-NEXT: v_writelane_b32 v2, s0, 5 -; CHECK-NEXT: v_writelane_b32 v2, s1, 6 +; CHECK-NEXT: v_writelane_b32 v0, s0, 5 +; CHECK-NEXT: v_writelane_b32 v0, s1, 6 ; CHECK-NEXT: s_mov_b32 s1, 0x40260000 -; CHECK-NEXT: v_writelane_b32 v2, s0, 7 +; CHECK-NEXT: v_writelane_b32 v0, s0, 7 ; CHECK-NEXT: s_waitcnt lgkmcnt(0) -; CHECK-NEXT: v_mov_b32_e32 v0, s2 -; CHECK-NEXT: v_writelane_b32 v2, s1, 8 -; CHECK-NEXT: v_mov_b32_e32 v1, s3 +; CHECK-NEXT: v_mov_b32_e32 v1, s2 +; CHECK-NEXT: v_writelane_b32 v0, s1, 8 +; CHECK-NEXT: v_mov_b32_e32 v2, s3 ; CHECK-NEXT: .LBB0_1: ; %for.cond4.preheader ; CHECK-NEXT: ; =>This Inner Loop Header: Depth=1 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], 0 +; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], 0 ; CHECK-NEXT: s_mov_b32 s2, 0 ; CHECK-NEXT: s_mov_b32 s3, 0x40140000 -; CHECK-NEXT: v_writelane_b32 v2, s6, 9 -; CHECK-NEXT: v_writelane_b32 v2, s7, 10 -; CHECK-NEXT: v_writelane_b32 v2, s0, 11 -; CHECK-NEXT: v_readlane_b32 s6, v2, 1 -; CHECK-NEXT: v_readlane_b32 s7, v2, 2 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; CHECK-NEXT: v_writelane_b32 v0, s6, 9 +; CHECK-NEXT: v_writelane_b32 v0, s7, 10 +; CHECK-NEXT: v_writelane_b32 v0, s0, 11 +; CHECK-NEXT: v_readlane_b32 s6, v0, 1 +; CHECK-NEXT: v_readlane_b32 s7, v0, 2 +; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[2:3] ; CHECK-NEXT: s_mov_b32 s1, s7 ; CHECK-NEXT: s_mov_b32 s0, s2 -; CHECK-NEXT: v_writelane_b32 v2, s6, 1 -; CHECK-NEXT: v_writelane_b32 v2, s7, 2 -; CHECK-NEXT: v_readlane_b32 s6, v2, 9 -; CHECK-NEXT: v_readlane_b32 s7, v2, 10 +; CHECK-NEXT: v_writelane_b32 v0, s6, 1 +; CHECK-NEXT: v_writelane_b32 v0, s7, 2 +; CHECK-NEXT: v_readlane_b32 s6, v0, 9 +; CHECK-NEXT: v_readlane_b32 s7, v0, 10 ; CHECK-NEXT: s_mov_b32 s6, s2 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[0:1] -; CHECK-NEXT: v_readlane_b32 s0, v2, 3 -; CHECK-NEXT: v_readlane_b32 s1, v2, 4 +; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[0:1] +; CHECK-NEXT: v_readlane_b32 s0, v0, 3 +; CHECK-NEXT: v_readlane_b32 s1, v0, 4 ; CHECK-NEXT: s_mov_b32 s3, s1 ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, 0x40140000 ; CHECK-NEXT: s_mov_b32 s2, s0 ; CHECK-NEXT: s_mov_b32 s1, s3 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[6:7] -; CHECK-NEXT: v_writelane_b32 v2, s0, 3 -; CHECK-NEXT: v_writelane_b32 v2, s1, 4 -; CHECK-NEXT: v_readlane_b32 s0, v2, 5 -; CHECK-NEXT: v_readlane_b32 s1, v2, 6 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] +; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[6:7] +; CHECK-NEXT: v_writelane_b32 v0, s0, 3 +; CHECK-NEXT: v_writelane_b32 v0, s1, 4 +; CHECK-NEXT: v_readlane_b32 s0, v0, 5 +; CHECK-NEXT: v_readlane_b32 s1, v0, 6 +; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[2:3] ; CHECK-NEXT: s_mov_b32 s3, s1 ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, 0x40140000 ; CHECK-NEXT: s_mov_b32 s2, s0 ; CHECK-NEXT: s_mov_b32 s1, s3 -; CHECK-NEXT: v_writelane_b32 v2, s0, 5 -; CHECK-NEXT: v_writelane_b32 v2, s1, 6 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] -; CHECK-NEXT: v_readlane_b32 s0, v2, 7 -; CHECK-NEXT: v_readlane_b32 s1, v2, 8 +; CHECK-NEXT: v_writelane_b32 v0, s0, 5 +; CHECK-NEXT: v_writelane_b32 v0, s1, 6 +; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[2:3] +; CHECK-NEXT: v_readlane_b32 s0, v0, 7 +; CHECK-NEXT: v_readlane_b32 s1, v0, 8 ; CHECK-NEXT: s_mov_b32 s3, s1 ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, 0x40140000 ; CHECK-NEXT: s_mov_b32 s2, s0 ; CHECK-NEXT: s_mov_b32 s1, s3 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[2:3] -; CHECK-NEXT: v_writelane_b32 v2, s0, 7 -; CHECK-NEXT: v_writelane_b32 v2, s1, 8 +; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[2:3] +; CHECK-NEXT: v_writelane_b32 v0, s0, 7 +; CHECK-NEXT: v_writelane_b32 v0, s1, 8 ; CHECK-NEXT: s_mov_b32 s0, 0 ; CHECK-NEXT: s_mov_b32 s1, 0x40140000 ; CHECK-NEXT: s_mov_b32 s4, s0 -; CHECK-NEXT: v_readlane_b32 s0, v2, 0 -; CHECK-NEXT: v_readlane_b32 s2, v2, 11 -; CHECK-NEXT: v_add_f64 v[0:1], v[0:1], s[4:5] +; CHECK-NEXT: v_readlane_b32 s0, v0, 0 +; CHECK-NEXT: v_readlane_b32 s2, v0, 11 +; CHECK-NEXT: v_add_f64 v[1:2], v[1:2], s[4:5] ; CHECK-NEXT: s_add_i32 s2, s2, s0 -; CHECK-NEXT: v_writelane_b32 v2, s2, 11 -; CHECK-NEXT: v_readlane_b32 s0, v2, 11 +; CHECK-NEXT: v_writelane_b32 v0, s2, 11 +; CHECK-NEXT: v_readlane_b32 s0, v0, 11 ; CHECK-NEXT: s_cmpk_lt_i32 s0, 0xa00 ; CHECK-NEXT: s_cbranch_scc1 .LBB0_1 ; CHECK-NEXT: ; %bb.2: ; %for.cond.cleanup.loopexit ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_mov_b32_e32 v4, 0 -; CHECK-NEXT: global_store_dwordx2 v[3:4], v[0:1], off -; CHECK-NEXT: ; kill: killed $vgpr2 +; CHECK-NEXT: global_store_dwordx2 v[3:4], v[1:2], off ; CHECK-NEXT: s_endpgm entry: %0 = load i32, ptr addrspace(4) null, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll --- a/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll +++ b/llvm/test/CodeGen/AMDGPU/tuple-allocation-failure.ll @@ -12,9 +12,10 @@ ; GFX90A-NEXT: s_mov_b32 s33, s32 ; GFX90A-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX90A-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX90A-NEXT: s_mov_b64 exec, s[18:19] ; GFX90A-NEXT: s_addk_i32 s32, 0x400 -; GFX90A-NEXT: v_writelane_b32 v40, s16, 2 +; GFX90A-NEXT: v_writelane_b32 v41, s16, 0 ; GFX90A-NEXT: s_getpc_b64 s[16:17] ; GFX90A-NEXT: s_add_u32 s16, s16, wobble@gotpcrel32@lo+4 ; GFX90A-NEXT: s_addc_u32 s17, s17, wobble@gotpcrel32@hi+12 @@ -34,12 +35,12 @@ ; GLOBALNESS1-NEXT: s_mov_b64 s[36:37], s[6:7] ; GLOBALNESS1-NEXT: s_load_dwordx4 s[76:79], s[8:9], 0x0 ; GLOBALNESS1-NEXT: s_load_dword s6, s[8:9], 0x14 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v0 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, 0 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v42, v0 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v40, 0 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS1-NEXT: global_store_dword v[0:1], v42, off +; GLOBALNESS1-NEXT: global_store_dword v[0:1], v40, off ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: global_load_dword v0, v42, s[76:77] +; GLOBALNESS1-NEXT: global_load_dword v0, v40, s[76:77] ; GLOBALNESS1-NEXT: s_mov_b64 s[40:41], s[4:5] ; GLOBALNESS1-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS1-NEXT: s_load_dword s7, s[8:9], 0x20 @@ -47,10 +48,10 @@ ; GLOBALNESS1-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS1-NEXT: s_add_u32 s0, s0, s17 ; GLOBALNESS1-NEXT: s_addc_u32 s1, s1, 0 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, 0x40994400 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, 0x40994400 ; GLOBALNESS1-NEXT: s_bitcmp1_b32 s78, 0 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[42:43] +; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[40:41] ; GLOBALNESS1-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 ; GLOBALNESS1-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GLOBALNESS1-NEXT: s_cselect_b64 s[4:5], -1, 0 @@ -116,9 +117,9 @@ ; GLOBALNESS1-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS1-NEXT: ; Child Loop BB1_15 Depth 2 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[0:1], s[74:75], s[74:75] op_sel:[0,1] -; GLOBALNESS1-NEXT: flat_load_dword v40, v[0:1] +; GLOBALNESS1-NEXT: flat_load_dword v43, v[0:1] ; GLOBALNESS1-NEXT: s_add_u32 s8, s38, 40 -; GLOBALNESS1-NEXT: buffer_store_dword v42, off, s[0:3], 0 +; GLOBALNESS1-NEXT: buffer_store_dword v40, off, s[0:3], 0 ; GLOBALNESS1-NEXT: flat_load_dword v46, v[0:1] ; GLOBALNESS1-NEXT: s_addc_u32 s9, s39, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -127,7 +128,7 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 ; GLOBALNESS1-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] ; GLOBALNESS1-NEXT: s_and_b64 vcc, exec, s[46:47] @@ -170,13 +171,13 @@ ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_12 ; GLOBALNESS1-NEXT: ; %bb.11: ; %bb39.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[40:41], off ; GLOBALNESS1-NEXT: .LBB1_12: ; %bb44.lr.ph.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS1-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 -; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc +; GLOBALNESS1-NEXT: v_cndmask_b32_e32 v2, 0, v43, vcc ; GLOBALNESS1-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS1-NEXT: v_cmp_nlt_f64_e64 s[64:65], 0, v[0:1] ; GLOBALNESS1-NEXT: v_cmp_eq_u32_e64 s[66:67], 0, v2 @@ -223,7 +224,7 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[46:47], 0, 0 ; GLOBALNESS1-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -233,15 +234,15 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 ; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS1-NEXT: s_swappc_b64 s[30:31], s[76:77] ; GLOBALNESS1-NEXT: s_and_saveexec_b64 s[4:5], s[66:67] ; GLOBALNESS1-NEXT: s_cbranch_execz .LBB1_13 ; GLOBALNESS1-NEXT: ; %bb.22: ; %bb62.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[42:43], off +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 +; GLOBALNESS1-NEXT: global_store_dwordx2 v[46:47], v[40:41], off ; GLOBALNESS1-NEXT: s_branch .LBB1_13 ; GLOBALNESS1-NEXT: .LBB1_23: ; %LeafBlock ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -268,15 +269,15 @@ ; GLOBALNESS1-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS1-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[40:41], off ; GLOBALNESS1-NEXT: s_branch .LBB1_1 ; GLOBALNESS1-NEXT: .LBB1_29: ; %bb73.i ; GLOBALNESS1-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v43, v42 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v41, v40 ; GLOBALNESS1-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[42:43], off +; GLOBALNESS1-NEXT: global_store_dwordx2 v[2:3], v[40:41], off ; GLOBALNESS1-NEXT: s_branch .LBB1_2 ; GLOBALNESS1-NEXT: .LBB1_30: ; %loop.exit.guard ; GLOBALNESS1-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -291,7 +292,7 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 @@ -309,7 +310,7 @@ ; GLOBALNESS1-NEXT: s_mov_b32 s12, s72 ; GLOBALNESS1-NEXT: s_mov_b32 s13, s71 ; GLOBALNESS1-NEXT: s_mov_b32 s14, s70 -; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS1-NEXT: v_mov_b32_e32 v31, v42 ; GLOBALNESS1-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS1-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS1-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 @@ -321,12 +322,12 @@ ; GLOBALNESS0-NEXT: s_mov_b64 s[36:37], s[6:7] ; GLOBALNESS0-NEXT: s_load_dwordx4 s[72:75], s[8:9], 0x0 ; GLOBALNESS0-NEXT: s_load_dword s6, s[8:9], 0x14 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v0 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, 0 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v42, v0 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v40, 0 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], 0, 0 -; GLOBALNESS0-NEXT: global_store_dword v[0:1], v42, off +; GLOBALNESS0-NEXT: global_store_dword v[0:1], v40, off ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: global_load_dword v0, v42, s[72:73] +; GLOBALNESS0-NEXT: global_load_dword v0, v40, s[72:73] ; GLOBALNESS0-NEXT: s_mov_b64 s[40:41], s[4:5] ; GLOBALNESS0-NEXT: s_load_dwordx2 s[4:5], s[8:9], 0x18 ; GLOBALNESS0-NEXT: s_load_dword s7, s[8:9], 0x20 @@ -334,10 +335,10 @@ ; GLOBALNESS0-NEXT: s_addc_u32 flat_scratch_hi, s13, 0 ; GLOBALNESS0-NEXT: s_add_u32 s0, s0, s17 ; GLOBALNESS0-NEXT: s_addc_u32 s1, s1, 0 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, 0x40994400 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, 0x40994400 ; GLOBALNESS0-NEXT: s_bitcmp1_b32 s74, 0 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) -; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[42:43] +; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e32 vcc, s[4:5], v[40:41] ; GLOBALNESS0-NEXT: v_cmp_ngt_f64_e64 s[4:5], s[4:5], 0 ; GLOBALNESS0-NEXT: v_cndmask_b32_e64 v2, 0, 1, s[4:5] ; GLOBALNESS0-NEXT: s_cselect_b64 s[4:5], -1, 0 @@ -403,9 +404,9 @@ ; GLOBALNESS0-NEXT: ; =>This Loop Header: Depth=1 ; GLOBALNESS0-NEXT: ; Child Loop BB1_15 Depth 2 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[0:1], s[76:77], s[76:77] op_sel:[0,1] -; GLOBALNESS0-NEXT: flat_load_dword v40, v[0:1] +; GLOBALNESS0-NEXT: flat_load_dword v43, v[0:1] ; GLOBALNESS0-NEXT: s_add_u32 s8, s38, 40 -; GLOBALNESS0-NEXT: buffer_store_dword v42, off, s[0:3], 0 +; GLOBALNESS0-NEXT: buffer_store_dword v40, off, s[0:3], 0 ; GLOBALNESS0-NEXT: flat_load_dword v46, v[0:1] ; GLOBALNESS0-NEXT: s_addc_u32 s9, s39, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -414,7 +415,7 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 ; GLOBALNESS0-NEXT: s_waitcnt lgkmcnt(0) ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] ; GLOBALNESS0-NEXT: s_and_b64 vcc, exec, s[46:47] @@ -457,13 +458,13 @@ ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_12 ; GLOBALNESS0-NEXT: ; %bb.11: ; %bb39.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[40:41], off ; GLOBALNESS0-NEXT: .LBB1_12: ; %bb44.lr.ph.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 ; GLOBALNESS0-NEXT: v_cmp_ne_u32_e32 vcc, 0, v46 -; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v40, vcc +; GLOBALNESS0-NEXT: v_cndmask_b32_e32 v2, 0, v43, vcc ; GLOBALNESS0-NEXT: s_waitcnt vmcnt(0) ; GLOBALNESS0-NEXT: v_cmp_nlt_f64_e64 s[64:65], 0, v[0:1] ; GLOBALNESS0-NEXT: v_cmp_eq_u32_e64 s[66:67], 0, v2 @@ -510,7 +511,7 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[46:47], 0, 0 ; GLOBALNESS0-NEXT: s_mov_b64 s[4:5], s[40:41] @@ -520,15 +521,15 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 ; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[44:45], off ; GLOBALNESS0-NEXT: s_swappc_b64 s[30:31], s[78:79] ; GLOBALNESS0-NEXT: s_and_saveexec_b64 s[4:5], s[66:67] ; GLOBALNESS0-NEXT: s_cbranch_execz .LBB1_13 ; GLOBALNESS0-NEXT: ; %bb.22: ; %bb62.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_15 Depth=2 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[42:43], off +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 +; GLOBALNESS0-NEXT: global_store_dwordx2 v[46:47], v[40:41], off ; GLOBALNESS0-NEXT: s_branch .LBB1_13 ; GLOBALNESS0-NEXT: .LBB1_23: ; %LeafBlock ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 @@ -555,15 +556,15 @@ ; GLOBALNESS0-NEXT: s_cbranch_vccnz .LBB1_1 ; GLOBALNESS0-NEXT: ; %bb.28: ; %bb69.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[40:41], off ; GLOBALNESS0-NEXT: s_branch .LBB1_1 ; GLOBALNESS0-NEXT: .LBB1_29: ; %bb73.i ; GLOBALNESS0-NEXT: ; in Loop: Header=BB1_4 Depth=1 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v43, v42 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v41, v40 ; GLOBALNESS0-NEXT: v_pk_mov_b32 v[2:3], 0, 0 -; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[42:43], off +; GLOBALNESS0-NEXT: global_store_dwordx2 v[2:3], v[40:41], off ; GLOBALNESS0-NEXT: s_branch .LBB1_2 ; GLOBALNESS0-NEXT: .LBB1_30: ; %loop.exit.guard ; GLOBALNESS0-NEXT: s_andn2_b64 vcc, exec, s[4:5] @@ -578,7 +579,7 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 @@ -596,7 +597,7 @@ ; GLOBALNESS0-NEXT: s_mov_b32 s12, s70 ; GLOBALNESS0-NEXT: s_mov_b32 s13, s69 ; GLOBALNESS0-NEXT: s_mov_b32 s14, s68 -; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v41 +; GLOBALNESS0-NEXT: v_mov_b32_e32 v31, v42 ; GLOBALNESS0-NEXT: s_getpc_b64 s[16:17] ; GLOBALNESS0-NEXT: s_add_u32 s16, s16, widget@rel32@lo+4 ; GLOBALNESS0-NEXT: s_addc_u32 s17, s17, widget@rel32@hi+12 diff --git a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll --- a/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll +++ b/llvm/test/CodeGen/AMDGPU/unstructured-cfg-def-use-issue.ll @@ -10,8 +10,9 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v40, s16, 16 +; GCN-NEXT: v_writelane_b32 v42, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x400 ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 ; 4-byte Folded Spill ; GCN-NEXT: v_writelane_b32 v40, s30, 0 @@ -119,9 +120,10 @@ ; GCN-NEXT: v_readlane_b32 s31, v40, 1 ; GCN-NEXT: v_readlane_b32 s30, v40, 0 ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: v_readlane_b32 s4, v40, 16 +; GCN-NEXT: v_readlane_b32 s4, v42, 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xfc00 ; GCN-NEXT: s_mov_b32 s33, s4 @@ -267,8 +269,9 @@ ; GCN-NEXT: s_mov_b32 s33, s32 ; GCN-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GCN-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v40, s16, 28 +; GCN-NEXT: v_writelane_b32 v46, s16, 0 ; GCN-NEXT: s_addk_i32 s32, 0x800 ; GCN-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GCN-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill @@ -471,9 +474,10 @@ ; GCN-NEXT: buffer_load_dword v43, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GCN-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GCN-NEXT: v_readlane_b32 s4, v40, 28 +; GCN-NEXT: v_readlane_b32 s4, v46, 0 ; GCN-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GCN-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GCN-NEXT: s_mov_b64 exec, s[6:7] ; GCN-NEXT: s_addk_i32 s32, 0xf800 ; GCN-NEXT: s_mov_b32 s33, s4 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-spill-placement-issue61083.ll @@ -13,61 +13,47 @@ ; CHECK: ; %bb.0: ; %bb ; CHECK-NEXT: s_add_u32 s0, s0, s15 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: ; implicit-def: $vgpr1 ; CHECK-NEXT: v_mov_b32_e32 v2, v0 -; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[8:9] -; CHECK-NEXT: v_mov_b32_e32 v1, 0 -; CHECK-NEXT: global_load_ushort v3, v1, s[4:5] offset:4 +; CHECK-NEXT: v_mov_b32_e32 v0, 0 +; CHECK-NEXT: global_load_ushort v3, v0, s[4:5] offset:4 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:8 ; 4-byte Folded Spill +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill ; CHECK-NEXT: ; implicit-def: $sgpr4 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: v_cmp_eq_u32_e64 s[6:7], v2, s4 ; CHECK-NEXT: s_mov_b32 s4, 0 ; CHECK-NEXT: v_mov_b32_e32 v2, s4 -; CHECK-NEXT: ds_write_b8 v1, v2 +; CHECK-NEXT: ds_write_b8 v0, v2 ; CHECK-NEXT: s_mov_b64 s[4:5], exec -; CHECK-NEXT: v_writelane_b32 v0, s4, 0 -; CHECK-NEXT: v_writelane_b32 v0, s5, 1 -; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b64 exec, s[8:9] +; CHECK-NEXT: v_writelane_b32 v1, s4, 0 +; CHECK-NEXT: v_writelane_b32 v1, s5, 1 ; CHECK-NEXT: s_and_b64 s[4:5], s[4:5], s[6:7] ; CHECK-NEXT: s_mov_b64 exec, s[4:5] ; CHECK-NEXT: s_cbranch_execz .LBB0_2 ; CHECK-NEXT: ; %bb.1: ; %bb193 ; CHECK-NEXT: .LBB0_2: ; %bb194 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload -; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[8:9] -; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: v_readlane_b32 s4, v1, 0 ; CHECK-NEXT: v_readlane_b32 s5, v1, 1 ; CHECK-NEXT: s_or_b64 exec, exec, s[4:5] ; CHECK-NEXT: s_mov_b32 s4, 0 +; CHECK-NEXT: s_waitcnt vmcnt(0) ; CHECK-NEXT: v_cmp_ne_u16_e64 s[4:5], v0, s4 ; CHECK-NEXT: s_and_b64 vcc, exec, s[4:5] ; CHECK-NEXT: s_cbranch_vccnz .LBB0_4 ; CHECK-NEXT: ; %bb.3: ; %bb201 -; CHECK-NEXT: buffer_load_dword v1, off, s[0:3], 0 offset:8 ; 4-byte Folded Reload +; CHECK-NEXT: buffer_load_dword v2, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload ; CHECK-NEXT: s_getpc_b64 s[4:5] ; CHECK-NEXT: s_add_u32 s4, s4, V2@rel32@lo+4 ; CHECK-NEXT: s_addc_u32 s5, s5, V2@rel32@hi+12 ; CHECK-NEXT: v_mov_b32_e32 v0, 0 ; CHECK-NEXT: s_waitcnt vmcnt(0) -; CHECK-NEXT: global_store_short v0, v1, s[4:5] +; CHECK-NEXT: global_store_short v0, v2, s[4:5] ; CHECK-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) ; CHECK-NEXT: s_barrier ; CHECK-NEXT: s_trap 2 ; CHECK-NEXT: ; divergent unreachable ; CHECK-NEXT: .LBB0_4: ; %UnifiedReturnBlock -; CHECK-NEXT: s_or_saveexec_b64 s[8:9], -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b64 exec, s[8:9] -; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm bb: %i10 = tail call i32 @llvm.amdgcn.workitem.id.x() diff --git a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr-tuple-allocation.ll @@ -15,6 +15,7 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: v_mov_b32_e32 v36, v16 ; GFX9-NEXT: v_mov_b32_e32 v35, v15 @@ -35,7 +36,7 @@ ; GFX9-NEXT: ;;#ASMEND ; GFX9-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v40, s4, 2 +; GFX9-NEXT: v_writelane_b32 v45, s4, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 @@ -54,9 +55,10 @@ ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: v_readlane_b32 s4, v45, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 @@ -70,6 +72,7 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: v_mov_b32_e32 v36, v16 @@ -91,12 +94,12 @@ ; GFX10-NEXT: ;;#ASMEND ; GFX10-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: v_writelane_b32 v40, s4, 2 +; GFX10-NEXT: v_writelane_b32 v45, s4, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 -; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v40, s30, 0 +; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 ; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt lgkmcnt(0) ; GFX10-NEXT: s_swappc_b64 s[30:31], s[4:5] @@ -111,9 +114,11 @@ ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:12 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s4, v40, 2 +; GFX10-NEXT: v_readlane_b32 s4, v45, 0 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:16 +; GFX10-NEXT: buffer_load_dword v45, off, s[0:3], s33 offset:20 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 @@ -127,7 +132,9 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:16 +; GFX11-NEXT: scratch_store_b32 off, v45, s33 offset:20 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: v_dual_mov_b32 v36, v16 :: v_dual_mov_b32 v35, v15 ; GFX11-NEXT: v_dual_mov_b32 v34, v14 :: v_dual_mov_b32 v33, v13 @@ -147,12 +154,12 @@ ; GFX11-NEXT: ;;#ASMEND ; GFX11-NEXT: image_gather4_c_b_cl v[41:44], v[32:36], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_add_i32 s32, s32, 32 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v45, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 ; GFX11-NEXT: s_waitcnt lgkmcnt(0) ; GFX11-NEXT: s_swappc_b64 s[30:31], s[0:1] @@ -165,9 +172,11 @@ ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:12 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v45, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:16 +; GFX11-NEXT: scratch_load_b32 v45, off, s33 offset:20 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 @@ -205,6 +214,7 @@ ; GFX9-NEXT: s_mov_b32 s33, s32 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-NEXT: buffer_store_dword v42, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill @@ -218,7 +228,7 @@ ; GFX9-NEXT: v_mov_b32_e32 v41, v12 ; GFX9-NEXT: image_gather4_c_b_cl v[0:3], v[41:45], s[4:11], s[4:7] dmask:0x1 ; GFX9-NEXT: s_addk_i32 s32, 0x800 -; GFX9-NEXT: v_writelane_b32 v40, s4, 2 +; GFX9-NEXT: v_writelane_b32 v46, s4, 0 ; GFX9-NEXT: s_getpc_b64 s[4:5] ; GFX9-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX9-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 @@ -238,9 +248,10 @@ ; GFX9-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload ; GFX9-NEXT: v_readlane_b32 s31, v40, 1 ; GFX9-NEXT: v_readlane_b32 s30, v40, 0 -; GFX9-NEXT: v_readlane_b32 s4, v40, 2 +; GFX9-NEXT: v_readlane_b32 s4, v46, 0 ; GFX9-NEXT: s_or_saveexec_b64 s[6:7], -1 ; GFX9-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload ; GFX9-NEXT: s_mov_b64 exec, s[6:7] ; GFX9-NEXT: s_addk_i32 s32, 0xf800 ; GFX9-NEXT: s_mov_b32 s33, s4 @@ -254,6 +265,7 @@ ; GFX10-NEXT: s_mov_b32 s33, s32 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 ; GFX10-NEXT: buffer_store_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX10-NEXT: buffer_store_dword v46, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill @@ -263,18 +275,18 @@ ; GFX10-NEXT: buffer_store_dword v45, off, s[0:3], s33 ; 4-byte Folded Spill ; GFX10-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[4:11], s[4:7] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX10-NEXT: s_addk_i32 s32, 0x400 -; GFX10-NEXT: v_writelane_b32 v40, s4, 2 +; GFX10-NEXT: v_writelane_b32 v46, s4, 0 ; GFX10-NEXT: s_getpc_b64 s[4:5] ; GFX10-NEXT: s_add_u32 s4, s4, extern_func@gotpcrel32@lo+4 ; GFX10-NEXT: s_addc_u32 s5, s5, extern_func@gotpcrel32@hi+12 -; GFX10-NEXT: v_mov_b32_e32 v41, v16 +; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; GFX10-NEXT: v_mov_b32_e32 v41, v16 ; GFX10-NEXT: v_mov_b32_e32 v42, v15 -; GFX10-NEXT: v_writelane_b32 v40, s30, 0 ; GFX10-NEXT: v_mov_b32_e32 v43, v14 +; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: v_mov_b32_e32 v44, v13 ; GFX10-NEXT: v_mov_b32_e32 v45, v12 -; GFX10-NEXT: v_writelane_b32 v40, s31, 1 ; GFX10-NEXT: s_waitcnt vmcnt(0) ; GFX10-NEXT: global_store_dwordx4 v[0:1], v[0:3], off ; GFX10-NEXT: s_waitcnt lgkmcnt(0) @@ -288,9 +300,11 @@ ; GFX10-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:16 ; GFX10-NEXT: v_readlane_b32 s31, v40, 1 ; GFX10-NEXT: v_readlane_b32 s30, v40, 0 -; GFX10-NEXT: v_readlane_b32 s4, v40, 2 +; GFX10-NEXT: v_readlane_b32 s4, v46, 0 ; GFX10-NEXT: s_or_saveexec_b32 s5, -1 -; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX10-NEXT: s_clause 0x1 +; GFX10-NEXT: buffer_load_dword v40, off, s[0:3], s33 offset:20 +; GFX10-NEXT: buffer_load_dword v46, off, s[0:3], s33 offset:24 ; GFX10-NEXT: s_waitcnt_depctr 0xffe3 ; GFX10-NEXT: s_mov_b32 exec_lo, s5 ; GFX10-NEXT: s_addk_i32 s32, 0xfc00 @@ -304,7 +318,9 @@ ; GFX11-NEXT: s_mov_b32 s0, s33 ; GFX11-NEXT: s_mov_b32 s33, s32 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:20 ; 4-byte Folded Spill +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_store_b32 off, v40, s33 offset:20 +; GFX11-NEXT: scratch_store_b32 off, v46, s33 offset:24 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_clause 0x4 ; GFX11-NEXT: scratch_store_b32 off, v41, s33 offset:16 @@ -314,16 +330,16 @@ ; GFX11-NEXT: scratch_store_b32 off, v45, s33 ; GFX11-NEXT: image_gather4_c_b_cl v[0:3], v[12:16], s[0:7], s[0:3] dmask:0x1 dim:SQ_RSRC_IMG_2D ; GFX11-NEXT: s_add_i32 s32, s32, 32 -; GFX11-NEXT: v_writelane_b32 v40, s0, 2 +; GFX11-NEXT: v_writelane_b32 v46, s0, 0 ; GFX11-NEXT: s_getpc_b64 s[0:1] ; GFX11-NEXT: s_add_u32 s0, s0, extern_func@gotpcrel32@lo+4 ; GFX11-NEXT: s_addc_u32 s1, s1, extern_func@gotpcrel32@hi+12 -; GFX11-NEXT: v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15 -; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 ; GFX11-NEXT: v_writelane_b32 v40, s30, 0 +; GFX11-NEXT: s_load_b64 s[0:1], s[0:1], 0x0 +; GFX11-NEXT: v_dual_mov_b32 v41, v16 :: v_dual_mov_b32 v42, v15 ; GFX11-NEXT: v_dual_mov_b32 v43, v14 :: v_dual_mov_b32 v44, v13 -; GFX11-NEXT: v_mov_b32_e32 v45, v12 ; GFX11-NEXT: v_writelane_b32 v40, s31, 1 +; GFX11-NEXT: v_mov_b32_e32 v45, v12 ; GFX11-NEXT: s_waitcnt vmcnt(0) ; GFX11-NEXT: global_store_b128 v[0:1], v[0:3], off ; GFX11-NEXT: s_waitcnt lgkmcnt(0) @@ -337,9 +353,11 @@ ; GFX11-NEXT: scratch_load_b32 v41, off, s33 offset:16 ; GFX11-NEXT: v_readlane_b32 s31, v40, 1 ; GFX11-NEXT: v_readlane_b32 s30, v40, 0 -; GFX11-NEXT: v_readlane_b32 s0, v40, 2 +; GFX11-NEXT: v_readlane_b32 s0, v46, 0 ; GFX11-NEXT: s_or_saveexec_b32 s1, -1 -; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:20 ; 4-byte Folded Reload +; GFX11-NEXT: s_clause 0x1 +; GFX11-NEXT: scratch_load_b32 v40, off, s33 offset:20 +; GFX11-NEXT: scratch_load_b32 v46, off, s33 offset:24 ; GFX11-NEXT: s_mov_b32 exec_lo, s1 ; GFX11-NEXT: s_addk_i32 s32, 0xffe0 ; GFX11-NEXT: s_mov_b32 s33, s0 diff --git a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll --- a/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll +++ b/llvm/test/CodeGen/AMDGPU/vgpr_constant_to_sgpr.ll @@ -9,21 +9,17 @@ define protected amdgpu_kernel void @kern(ptr %addr) !llvm.amdgcn.lds.kernel.id !0 { ; CHECK-LABEL: kern: ; CHECK: ; %bb.0: -; CHECK-NEXT: s_mov_b32 s32, 0x200 +; CHECK-NEXT: s_mov_b32 s32, 0 ; CHECK-NEXT: s_add_u32 s12, s12, s17 ; CHECK-NEXT: s_addc_u32 s13, s13, 0 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_LO), s12 ; CHECK-NEXT: s_setreg_b32 hwreg(HW_REG_FLAT_SCR_HI), s13 ; CHECK-NEXT: s_add_u32 s0, s0, s17 ; CHECK-NEXT: s_addc_u32 s1, s1, 0 -; CHECK-NEXT: ; implicit-def: $vgpr3 -; CHECK-NEXT: v_writelane_b32 v3, s16, 0 -; CHECK-NEXT: s_or_saveexec_b32 s33, -1 -; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:4 ; 4-byte Folded Spill -; CHECK-NEXT: s_mov_b32 exec_lo, s33 +; CHECK-NEXT: v_writelane_b32 v40, s16, 0 ; CHECK-NEXT: s_mov_b32 s13, s15 ; CHECK-NEXT: s_mov_b32 s12, s14 -; CHECK-NEXT: v_readlane_b32 s14, v3, 0 +; CHECK-NEXT: v_readlane_b32 s14, v40, 0 ; CHECK-NEXT: s_mov_b64 s[16:17], s[8:9] ; CHECK-NEXT: s_load_dwordx2 s[8:9], s[16:17], 0x0 ; CHECK-NEXT: v_mov_b32_e32 v5, 42 @@ -56,10 +52,6 @@ ; CHECK-NEXT: s_mov_b64 s[2:3], s[22:23] ; CHECK-NEXT: s_waitcnt lgkmcnt(0) ; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] -; CHECK-NEXT: s_or_saveexec_b32 s33, -1 -; CHECK-NEXT: buffer_load_dword v0, off, s[0:3], 0 offset:4 ; 4-byte Folded Reload -; CHECK-NEXT: s_mov_b32 exec_lo, s33 -; CHECK-NEXT: ; kill: killed $vgpr0 ; CHECK-NEXT: s_endpgm store i32 42, ptr %addr call fastcc void @unknown_call() diff --git a/llvm/test/CodeGen/AMDGPU/wave32.ll b/llvm/test/CodeGen/AMDGPU/wave32.ll --- a/llvm/test/CodeGen/AMDGPU/wave32.ll +++ b/llvm/test/CodeGen/AMDGPU/wave32.ll @@ -2856,23 +2856,26 @@ ; GFX1032-NEXT: s_mov_b32 s33, s32 ; GFX1032-NEXT: s_or_saveexec_b32 s17, -1 ; GFX1032-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX1032-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_mov_b32 exec_lo, s17 ; GFX1032-NEXT: s_addk_i32 s32, 0x200 -; GFX1032-NEXT: v_writelane_b32 v40, s16, 2 +; GFX1032-NEXT: v_writelane_b32 v41, s16, 0 ; GFX1032-NEXT: s_getpc_b64 s[16:17] ; GFX1032-NEXT: s_add_u32 s16, s16, external_void_func_void@gotpcrel32@lo+4 ; GFX1032-NEXT: s_addc_u32 s17, s17, external_void_func_void@gotpcrel32@hi+12 -; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX1032-NEXT: v_writelane_b32 v40, s30, 0 +; GFX1032-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX1032-NEXT: v_writelane_b32 v40, s31, 1 ; GFX1032-NEXT: s_waitcnt lgkmcnt(0) ; GFX1032-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1032-NEXT: v_readlane_b32 s31, v40, 1 ; GFX1032-NEXT: v_readlane_b32 s30, v40, 0 -; GFX1032-NEXT: v_readlane_b32 s4, v40, 2 +; GFX1032-NEXT: v_readlane_b32 s4, v41, 0 ; GFX1032-NEXT: s_or_saveexec_b32 s5, -1 -; GFX1032-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX1032-NEXT: s_clause 0x1 +; GFX1032-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX1032-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX1032-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1032-NEXT: s_mov_b32 exec_lo, s5 ; GFX1032-NEXT: s_addk_i32 s32, 0xfe00 @@ -2887,23 +2890,26 @@ ; GFX1064-NEXT: s_mov_b32 s33, s32 ; GFX1064-NEXT: s_or_saveexec_b64 s[18:19], -1 ; GFX1064-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; GFX1064-NEXT: buffer_store_dword v41, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_mov_b64 exec, s[18:19] ; GFX1064-NEXT: s_addk_i32 s32, 0x400 -; GFX1064-NEXT: v_writelane_b32 v40, s16, 2 +; GFX1064-NEXT: v_writelane_b32 v41, s16, 0 ; GFX1064-NEXT: s_getpc_b64 s[16:17] ; GFX1064-NEXT: s_add_u32 s16, s16, external_void_func_void@gotpcrel32@lo+4 ; GFX1064-NEXT: s_addc_u32 s17, s17, external_void_func_void@gotpcrel32@hi+12 -; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX1064-NEXT: v_writelane_b32 v40, s30, 0 +; GFX1064-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 ; GFX1064-NEXT: v_writelane_b32 v40, s31, 1 ; GFX1064-NEXT: s_waitcnt lgkmcnt(0) ; GFX1064-NEXT: s_swappc_b64 s[30:31], s[16:17] ; GFX1064-NEXT: v_readlane_b32 s31, v40, 1 ; GFX1064-NEXT: v_readlane_b32 s30, v40, 0 -; GFX1064-NEXT: v_readlane_b32 s4, v40, 2 +; GFX1064-NEXT: v_readlane_b32 s4, v41, 0 ; GFX1064-NEXT: s_or_saveexec_b64 s[6:7], -1 -; GFX1064-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload +; GFX1064-NEXT: s_clause 0x1 +; GFX1064-NEXT: buffer_load_dword v40, off, s[0:3], s33 +; GFX1064-NEXT: buffer_load_dword v41, off, s[0:3], s33 offset:4 ; GFX1064-NEXT: s_waitcnt_depctr 0xffe3 ; GFX1064-NEXT: s_mov_b64 exec, s[6:7] ; GFX1064-NEXT: s_addk_i32 s32, 0xfc00 diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-copy.ll +++ /dev/null @@ -1,68 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -verify-machineinstrs < %s | FileCheck -check-prefix=GFX90A %s - -; The test forces a high vector register pressure and there won't be sufficient VGPRs to be allocated -; for writelane/readlane SGPR spill instructions. Regalloc would split the vector register liverange -; by introducing a copy to AGPR register. The VGPR store to AGPR (v_accvgpr_write_b32) and later the -; restore from AGPR (v_accvgpr_read_b32) should be whole-wave operations and hence exec mask should be -; manipulated to ensure all lanes are active when these instructions are executed. -define void @vector_reg_liverange_split() #0 { -; GFX90A-LABEL: vector_reg_liverange_split: -; GFX90A: ; %bb.0: -; GFX90A-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX90A-NEXT: s_mov_b32 s16, s33 -; GFX90A-NEXT: s_mov_b32 s33, s32 -; GFX90A-NEXT: s_xor_saveexec_b64 s[18:19], -1 -; GFX90A-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX90A-NEXT: s_mov_b64 exec, -1 -; GFX90A-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX90A-NEXT: buffer_store_dword a32, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX90A-NEXT: s_mov_b64 exec, s[18:19] -; GFX90A-NEXT: v_writelane_b32 v40, s16, 2 -; GFX90A-NEXT: ; implicit-def: $vgpr0 -; GFX90A-NEXT: v_writelane_b32 v40, s30, 0 -; GFX90A-NEXT: s_addk_i32 s32, 0x400 -; GFX90A-NEXT: v_writelane_b32 v40, s31, 1 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; def s20 -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_writelane_b32 v0, s20, 0 -; GFX90A-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GFX90A-NEXT: v_accvgpr_write_b32 a32, v0 -; GFX90A-NEXT: s_mov_b64 exec, s[28:29] -; GFX90A-NEXT: s_getpc_b64 s[16:17] -; GFX90A-NEXT: s_add_u32 s16, s16, foo@gotpcrel32@lo+4 -; GFX90A-NEXT: s_addc_u32 s17, s17, foo@gotpcrel32@hi+12 -; GFX90A-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GFX90A-NEXT: s_waitcnt lgkmcnt(0) -; GFX90A-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GFX90A-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GFX90A-NEXT: v_accvgpr_read_b32 v0, a32 -; GFX90A-NEXT: s_mov_b64 exec, s[28:29] -; GFX90A-NEXT: v_readlane_b32 s20, v0, 0 -; GFX90A-NEXT: ;;#ASMSTART -; GFX90A-NEXT: ; use s20 -; GFX90A-NEXT: ;;#ASMEND -; GFX90A-NEXT: v_readlane_b32 s31, v40, 1 -; GFX90A-NEXT: v_readlane_b32 s30, v40, 0 -; GFX90A-NEXT: ; kill: killed $vgpr0 -; GFX90A-NEXT: v_readlane_b32 s4, v40, 2 -; GFX90A-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GFX90A-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX90A-NEXT: s_mov_b64 exec, -1 -; GFX90A-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX90A-NEXT: buffer_load_dword a32, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX90A-NEXT: s_mov_b64 exec, s[6:7] -; GFX90A-NEXT: s_addk_i32 s32, 0xfc00 -; GFX90A-NEXT: s_mov_b32 s33, s4 -; GFX90A-NEXT: s_waitcnt vmcnt(0) -; GFX90A-NEXT: s_setpc_b64 s[30:31] - %s20 = call i32 asm sideeffect "; def $0","=${s20}"() - call void @foo() - call void asm sideeffect "; use $0","${s20}"(i32 %s20) - ret void -} - -declare void @foo() - -attributes #0 = { "amdgpu-num-vgpr"="41" "amdgpu-num-sgpr"="34"} diff --git a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll b/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll deleted file mode 100644 --- a/llvm/test/CodeGen/AMDGPU/whole-wave-register-spill.ll +++ /dev/null @@ -1,140 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 --verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN %s -; RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx906 -O0 --verify-machineinstrs -o - %s | FileCheck -check-prefix=GCN-O0 %s - -; Test whole-wave register spilling. - -; In this testcase, the return address registers, PC value (SGPR30_SGPR31) and the scratch SGPR used in -; the inline asm statements should be preserved across the call. Since the test limits the VGPR numbers, -; the PC will be spilled to the only available CSR VGPR (VGPR40) as we spill CSR SGPRs including the PC -; directly to the physical VGPR lane to correctly generate the CFIs. The SGPR20 will get spilled to the -; virtual VGPR lane and that would be allocated by regalloc. Since there is no free VGPR to allocate, RA -; must spill a scratch VGPR. The writelane/readlane instructions that spill/restore SGPRs into/from VGPR -; are whole-wave operations and hence the VGPRs involved in such operations require whole-wave spilling. - -define void @test() #0 { -; GCN-LABEL: test: -; GCN: ; %bb.0: -; GCN-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-NEXT: s_mov_b32 s16, s33 -; GCN-NEXT: s_mov_b32 s33, s32 -; GCN-NEXT: s_xor_saveexec_b64 s[18:19], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-NEXT: buffer_store_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[18:19] -; GCN-NEXT: v_writelane_b32 v40, s28, 2 -; GCN-NEXT: v_writelane_b32 v40, s29, 3 -; GCN-NEXT: v_writelane_b32 v40, s16, 4 -; GCN-NEXT: ; implicit-def: $vgpr0 -; GCN-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-NEXT: s_addk_i32 s32, 0x800 -; GCN-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-NEXT: ;;#ASMSTART -; GCN-NEXT: ; def s16 -; GCN-NEXT: ;;#ASMEND -; GCN-NEXT: v_writelane_b32 v0, s16, 0 -; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-NEXT: s_mov_b64 exec, s[28:29] -; GCN-NEXT: s_getpc_b64 s[16:17] -; GCN-NEXT: s_add_u32 s16, s16, ext_func@gotpcrel32@lo+4 -; GCN-NEXT: s_addc_u32 s17, s17, ext_func@gotpcrel32@hi+12 -; GCN-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GCN-NEXT: s_waitcnt lgkmcnt(0) -; GCN-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[28:29] -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s4, v1, 0 -; GCN-NEXT: v_mov_b32_e32 v0, s4 -; GCN-NEXT: global_store_dword v[0:1], v0, off -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-NEXT: ; kill: killed $vgpr1 -; GCN-NEXT: v_readlane_b32 s28, v40, 2 -; GCN-NEXT: v_readlane_b32 s29, v40, 3 -; GCN-NEXT: v_readlane_b32 s4, v40, 4 -; GCN-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GCN-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-NEXT: buffer_load_dword v1, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, -1 -; GCN-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-NEXT: s_mov_b64 exec, s[6:7] -; GCN-NEXT: s_addk_i32 s32, 0xf800 -; GCN-NEXT: s_mov_b32 s33, s4 -; GCN-NEXT: s_waitcnt vmcnt(0) -; GCN-NEXT: s_setpc_b64 s[30:31] -; -; GCN-O0-LABEL: test: -; GCN-O0: ; %bb.0: -; GCN-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GCN-O0-NEXT: s_mov_b32 s16, s33 -; GCN-O0-NEXT: s_mov_b32 s33, s32 -; GCN-O0-NEXT: s_xor_saveexec_b64 s[18:19], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, -1 -; GCN-O0-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[18:19] -; GCN-O0-NEXT: v_writelane_b32 v40, s28, 2 -; GCN-O0-NEXT: v_writelane_b32 v40, s29, 3 -; GCN-O0-NEXT: v_writelane_b32 v40, s16, 4 -; GCN-O0-NEXT: s_add_i32 s32, s32, 0x400 -; GCN-O0-NEXT: ; implicit-def: $vgpr0 -; GCN-O0-NEXT: v_writelane_b32 v40, s30, 0 -; GCN-O0-NEXT: v_writelane_b32 v40, s31, 1 -; GCN-O0-NEXT: ;;#ASMSTART -; GCN-O0-NEXT: ; def s16 -; GCN-O0-NEXT: ;;#ASMEND -; GCN-O0-NEXT: v_writelane_b32 v0, s16, 0 -; GCN-O0-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GCN-O0-NEXT: s_mov_b64 exec, s[28:29] -; GCN-O0-NEXT: s_getpc_b64 s[16:17] -; GCN-O0-NEXT: s_add_u32 s16, s16, ext_func@gotpcrel32@lo+4 -; GCN-O0-NEXT: s_addc_u32 s17, s17, ext_func@gotpcrel32@hi+12 -; GCN-O0-NEXT: s_load_dwordx2 s[16:17], s[16:17], 0x0 -; GCN-O0-NEXT: s_mov_b64 s[22:23], s[2:3] -; GCN-O0-NEXT: s_mov_b64 s[20:21], s[0:1] -; GCN-O0-NEXT: s_mov_b64 s[0:1], s[20:21] -; GCN-O0-NEXT: s_mov_b64 s[2:3], s[22:23] -; GCN-O0-NEXT: s_waitcnt lgkmcnt(0) -; GCN-O0-NEXT: s_swappc_b64 s[30:31], s[16:17] -; GCN-O0-NEXT: s_or_saveexec_b64 s[28:29], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[28:29] -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s4, v0, 0 -; GCN-O0-NEXT: ; implicit-def: $sgpr6_sgpr7 -; GCN-O0-NEXT: v_mov_b32_e32 v1, s6 -; GCN-O0-NEXT: v_mov_b32_e32 v2, s7 -; GCN-O0-NEXT: v_mov_b32_e32 v3, s4 -; GCN-O0-NEXT: global_store_dword v[1:2], v3, off -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: v_readlane_b32 s31, v40, 1 -; GCN-O0-NEXT: v_readlane_b32 s30, v40, 0 -; GCN-O0-NEXT: ; kill: killed $vgpr0 -; GCN-O0-NEXT: v_readlane_b32 s28, v40, 2 -; GCN-O0-NEXT: v_readlane_b32 s29, v40, 3 -; GCN-O0-NEXT: v_readlane_b32 s4, v40, 4 -; GCN-O0-NEXT: s_xor_saveexec_b64 s[6:7], -1 -; GCN-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, -1 -; GCN-O0-NEXT: buffer_load_dword v40, off, s[0:3], s33 ; 4-byte Folded Reload -; GCN-O0-NEXT: s_mov_b64 exec, s[6:7] -; GCN-O0-NEXT: s_add_i32 s32, s32, 0xfffffc00 -; GCN-O0-NEXT: s_mov_b32 s33, s4 -; GCN-O0-NEXT: s_waitcnt vmcnt(0) -; GCN-O0-NEXT: s_setpc_b64 s[30:31] - %sgpr = call i32 asm sideeffect "; def $0", "=s" () #0 - call void @ext_func() - store volatile i32 %sgpr, ptr addrspace(1) undef - ret void -} - -declare void @ext_func(); - -attributes #0 = { nounwind "amdgpu-num-vgpr"="41" "amdgpu-num-sgpr"="34"} diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved-spill.ll @@ -141,15 +141,10 @@ ; GFX9-O0: ; %bb.0: ; %entry ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: ; implicit-def: $vgpr3 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] ; GFX9-O0-NEXT: s_mov_b32 s40, s6 ; GFX9-O0-NEXT: s_mov_b32 s34, s4 ; GFX9-O0-NEXT: ; kill: def $sgpr40 killed $sgpr40 def $sgpr40_sgpr41 @@ -164,18 +159,17 @@ ; GFX9-O0-NEXT: s_mov_b32 s37, s44 ; GFX9-O0-NEXT: s_mov_b32 s38, s43 ; GFX9-O0-NEXT: s_mov_b32 s39, s42 -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_writelane_b32 v0, s40, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s41, 1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 3 +; GFX9-O0-NEXT: v_writelane_b32 v3, s40, 0 +; GFX9-O0-NEXT: v_writelane_b32 v3, s41, 1 +; GFX9-O0-NEXT: v_writelane_b32 v3, s34, 2 +; GFX9-O0-NEXT: v_writelane_b32 v3, s35, 3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 ; GFX9-O0-NEXT: s_nop 2 ; GFX9-O0-NEXT: buffer_load_dwordx2 v[4:5], off, s[36:39], s34 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:12 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v4 ; GFX9-O0-NEXT: s_not_b64 exec, exec @@ -188,24 +182,21 @@ ; GFX9-O0-NEXT: v_add_u32_e64 v1, v1, v2 ; GFX9-O0-NEXT: s_mov_b64 exec, s[36:37] ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v1 -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v3, s34 -; GFX9-O0-NEXT: v_mov_b32_e32 v3, s34 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 s[34:35], exec -; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 5 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v0, s34 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, s34 ; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-O0-NEXT: s_mov_b64 s[34:35], exec +; GFX9-O0-NEXT: v_writelane_b32 v3, s34, 4 +; GFX9-O0-NEXT: v_writelane_b32 v3, s35, 5 ; GFX9-O0-NEXT: s_and_b64 s[34:35], s[34:35], s[36:37] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_cbranch_execz .LBB1_2 ; GFX9-O0-NEXT: ; %bb.1: ; %if -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:12 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v5 ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, 0 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] @@ -218,28 +209,25 @@ ; GFX9-O0-NEXT: v_add_u32_e64 v1, v2, v1 ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: v_mov_b32_e32 v0, v1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 ; 4-byte Folded Spill ; GFX9-O0-NEXT: .LBB1_2: ; %merge -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_waitcnt vmcnt(2) -; GFX9-O0-NEXT: v_readlane_b32 s36, v0, 4 -; GFX9-O0-NEXT: v_readlane_b32 s37, v0, 5 +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: s_nop 0 +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: v_readlane_b32 s36, v3, 4 +; GFX9-O0-NEXT: v_readlane_b32 s37, v3, 5 ; GFX9-O0-NEXT: s_or_b64 exec, exec, s[36:37] -; GFX9-O0-NEXT: v_readlane_b32 s38, v0, 0 -; GFX9-O0-NEXT: v_readlane_b32 s39, v0, 1 -; GFX9-O0-NEXT: v_readlane_b32 s34, v0, 2 -; GFX9-O0-NEXT: v_readlane_b32 s35, v0, 3 +; GFX9-O0-NEXT: v_readlane_b32 s38, v3, 0 +; GFX9-O0-NEXT: v_readlane_b32 s39, v3, 1 +; GFX9-O0-NEXT: v_readlane_b32 s34, v3, 2 +; GFX9-O0-NEXT: v_readlane_b32 s35, v3, 3 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v3, v4 -; GFX9-O0-NEXT: v_cndmask_b32_e64 v3, 0, 1, s[36:37] +; GFX9-O0-NEXT: v_cmp_eq_u32_e64 s[36:37], v0, v4 +; GFX9-O0-NEXT: v_cndmask_b32_e64 v0, 0, 1, s[36:37] ; GFX9-O0-NEXT: s_mov_b32 s36, 1 -; GFX9-O0-NEXT: v_lshlrev_b32_e64 v3, s36, v3 +; GFX9-O0-NEXT: v_lshlrev_b32_e64 v0, s36, v0 ; GFX9-O0-NEXT: s_mov_b32 s36, 2 -; GFX9-O0-NEXT: v_and_b32_e64 v3, v3, s36 +; GFX9-O0-NEXT: v_and_b32_e64 v0, v0, s36 ; GFX9-O0-NEXT: s_mov_b32 s40, s35 ; GFX9-O0-NEXT: s_mov_b32 s36, s34 ; GFX9-O0-NEXT: s_mov_b32 s34, s39 @@ -249,12 +237,11 @@ ; GFX9-O0-NEXT: s_mov_b32 s38, s35 ; GFX9-O0-NEXT: s_mov_b32 s39, s34 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[36:39], s34 offset:4 -; GFX9-O0-NEXT: ; kill: killed $vgpr0 +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s32 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:24 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] @@ -559,28 +546,25 @@ ; GFX9-O0-LABEL: strict_wwm_call_i64: ; GFX9-O0: ; %bb.0: ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX9-O0-NEXT: s_mov_b32 s48, s33 +; GFX9-O0-NEXT: s_mov_b32 s46, s33 ; GFX9-O0-NEXT: s_mov_b32 s33, s32 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s33 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:48 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0x1000 -; GFX9-O0-NEXT: ; implicit-def: $vgpr0 +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xc00 ; GFX9-O0-NEXT: v_writelane_b32 v10, s30, 0 ; GFX9-O0-NEXT: v_writelane_b32 v10, s31, 1 ; GFX9-O0-NEXT: s_mov_b32 s34, s8 @@ -598,10 +582,10 @@ ; GFX9-O0-NEXT: s_mov_b32 s41, s45 ; GFX9-O0-NEXT: s_mov_b32 s42, s44 ; GFX9-O0-NEXT: s_mov_b32 s43, s35 -; GFX9-O0-NEXT: v_writelane_b32 v0, s40, 0 -; GFX9-O0-NEXT: v_writelane_b32 v0, s41, 1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s42, 2 -; GFX9-O0-NEXT: v_writelane_b32 v0, s43, 3 +; GFX9-O0-NEXT: v_writelane_b32 v10, s40, 2 +; GFX9-O0-NEXT: v_writelane_b32 v10, s41, 3 +; GFX9-O0-NEXT: v_writelane_b32 v10, s42, 4 +; GFX9-O0-NEXT: v_writelane_b32 v10, s43, 5 ; GFX9-O0-NEXT: ; kill: def $sgpr34 killed $sgpr34 def $sgpr34_sgpr35 ; GFX9-O0-NEXT: s_mov_b32 s35, s9 ; GFX9-O0-NEXT: ; kill: def $sgpr36_sgpr37 killed $sgpr34_sgpr35 @@ -613,11 +597,8 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v9, s37 ; GFX9-O0-NEXT: s_not_b64 exec, exec ; GFX9-O0-NEXT: s_or_saveexec_b64 s[34:35], -1 -; GFX9-O0-NEXT: v_writelane_b32 v0, s34, 4 -; GFX9-O0-NEXT: v_writelane_b32 v0, s35, 5 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Spill -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] +; GFX9-O0-NEXT: v_writelane_b32 v10, s34, 6 +; GFX9-O0-NEXT: v_writelane_b32 v10, s35, 7 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v8 ; GFX9-O0-NEXT: s_mov_b32 s34, 32 ; GFX9-O0-NEXT: ; implicit-def: $sgpr36_sgpr37 @@ -634,20 +615,13 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: s_waitcnt lgkmcnt(0) ; GFX9-O0-NEXT: s_swappc_b64 s[30:31], s[34:35] -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] -; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: v_readlane_b32 s34, v6, 4 -; GFX9-O0-NEXT: v_readlane_b32 s35, v6, 5 -; GFX9-O0-NEXT: v_readlane_b32 s36, v6, 0 -; GFX9-O0-NEXT: v_readlane_b32 s37, v6, 1 -; GFX9-O0-NEXT: v_readlane_b32 s38, v6, 2 -; GFX9-O0-NEXT: v_readlane_b32 s39, v6, 3 +; GFX9-O0-NEXT: v_readlane_b32 s34, v10, 6 +; GFX9-O0-NEXT: v_readlane_b32 s35, v10, 7 +; GFX9-O0-NEXT: v_readlane_b32 s36, v10, 2 +; GFX9-O0-NEXT: v_readlane_b32 s37, v10, 3 +; GFX9-O0-NEXT: v_readlane_b32 s38, v10, 4 +; GFX9-O0-NEXT: v_readlane_b32 s39, v10, 5 ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v0 -; GFX9-O0-NEXT: s_or_saveexec_b64 s[46:47], -1 -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload -; GFX9-O0-NEXT: s_mov_b64 exec, s[46:47] ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v1 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 ; GFX9-O0-NEXT: ; implicit-def: $sgpr40 @@ -656,30 +630,27 @@ ; GFX9-O0-NEXT: v_add_co_u32_e64 v2, s[40:41], v2, v4 ; GFX9-O0-NEXT: v_addc_co_u32_e64 v3, s[40:41], v3, v5, s[40:41] ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: v_mov_b32_e32 v6, v2 -; GFX9-O0-NEXT: v_mov_b32_e32 v7, v3 +; GFX9-O0-NEXT: v_mov_b32_e32 v0, v2 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, v3 ; GFX9-O0-NEXT: s_mov_b32 s34, 0 -; GFX9-O0-NEXT: buffer_store_dwordx2 v[6:7], off, s[36:39], s34 offset:4 +; GFX9-O0-NEXT: buffer_store_dwordx2 v[0:1], off, s[36:39], s34 offset:4 ; GFX9-O0-NEXT: v_readlane_b32 s31, v10, 1 ; GFX9-O0-NEXT: v_readlane_b32 s30, v10, 0 -; GFX9-O0-NEXT: ; kill: killed $vgpr0 ; GFX9-O0-NEXT: s_xor_saveexec_b64 s[34:35], -1 ; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s33 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:44 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:48 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s33 offset:4 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s33 offset:8 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:12 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:16 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:20 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:24 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s33 offset:28 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v3, off, s[0:3], s33 offset:32 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s33 offset:36 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s33 offset:40 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] -; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff000 -; GFX9-O0-NEXT: s_mov_b32 s33, s48 +; GFX9-O0-NEXT: s_add_i32 s32, s32, 0xfffff400 +; GFX9-O0-NEXT: s_mov_b32 s33, s46 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] ; @@ -938,7 +909,7 @@ ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, -1 -; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: buffer_store_dword v40, off, s[0:3], s32 offset:48 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_store_dword v41, off, s[0:3], s32 offset:44 ; 4-byte Folded Spill @@ -951,35 +922,35 @@ ; GFX9-O0-NEXT: v_writelane_b32 v42, s65, 1 ; GFX9-O0-NEXT: v_writelane_b32 v42, s66, 2 ; GFX9-O0-NEXT: v_writelane_b32 v42, s67, 3 -; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v10, off, s[0:3], s32 offset:68 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v9, off, s[0:3], s32 offset:60 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v8, off, s[0:3], s32 offset:76 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v7, off, s[0:3], s32 offset:64 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v6, off, s[0:3], s32 offset:84 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v4, off, s[0:3], s32 offset:92 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v3, off, s[0:3], s32 offset:80 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v2, off, s[0:3], s32 offset:52 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:88 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:20 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:16 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:116 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:12 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:112 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:8 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:108 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:4 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:104 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill ; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) -; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:100 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v0, off, s[0:3], s32 offset:96 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 @@ -989,145 +960,145 @@ ; GFX9-O0-NEXT: v_mov_b32_e32 v0, s4 ; GFX9-O0-NEXT: v_mov_b32_e32 v43, s5 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s6 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:200 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:196 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s7 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:192 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s8 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:188 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s9 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:184 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s10 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:180 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s11 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:176 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s12 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:172 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s13 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:168 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s14 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:164 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s15 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:160 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s16 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:156 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s18 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s17 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:152 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s19 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s18 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:148 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s20 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s19 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:144 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s21 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s20 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:140 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s21 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:136 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s22 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:132 ; 4-byte Folded Spill -; GFX9-O0-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s23 ; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:128 ; 4-byte Folded Spill +; GFX9-O0-NEXT: v_mov_b32_e32 v1, s24 +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill ; GFX9-O0-NEXT: v_mov_b32_e32 v47, s25 ; GFX9-O0-NEXT: v_mov_b32_e32 v46, s26 ; GFX9-O0-NEXT: v_mov_b32_e32 v45, s27 ; GFX9-O0-NEXT: v_mov_b32_e32 v44, s28 ; GFX9-O0-NEXT: v_mov_b32_e32 v1, s29 -; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:124 ; 4-byte Folded Spill +; GFX9-O0-NEXT: buffer_store_dword v1, off, s[0:3], s32 offset:120 ; 4-byte Folded Spill ; GFX9-O0-NEXT: ; kill: def $vgpr0 killed $vgpr0 def $vgpr0_vgpr1_vgpr2_vgpr3_vgpr4_vgpr5_vgpr6_vgpr7_vgpr8_vgpr9_vgpr10_vgpr11_vgpr12_vgpr13_vgpr14_vgpr15_vgpr16_vgpr17_vgpr18_vgpr19_vgpr20_vgpr21_vgpr22_vgpr23_vgpr24_vgpr25_vgpr26_vgpr27_vgpr28_vgpr29_vgpr30_vgpr31 killed $exec ; GFX9-O0-NEXT: v_mov_b32_e32 v1, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v2, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:196 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v3, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:192 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v4, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:188 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v5, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:184 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v6, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:180 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v7, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:176 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v8, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:172 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v9, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:168 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v10, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:164 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v11, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:160 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v12, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:156 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v13, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:152 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v14, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:148 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v15, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:144 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v16, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:140 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v17, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:136 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v18, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:132 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v19, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:128 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v20, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:124 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload ; GFX9-O0-NEXT: v_mov_b32_e32 v21, v47 -; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v47, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload ; GFX9-O0-NEXT: v_mov_b32_e32 v22, v46 -; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v46, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload ; GFX9-O0-NEXT: v_mov_b32_e32 v23, v45 -; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v45, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload ; GFX9-O0-NEXT: v_mov_b32_e32 v24, v44 -; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v44, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(4) ; GFX9-O0-NEXT: v_mov_b32_e32 v25, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: v_mov_b32_e32 v26, v43 -; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v43, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload ; GFX9-O0-NEXT: v_mov_b32_e32 v27, v47 ; GFX9-O0-NEXT: v_mov_b32_e32 v28, v46 ; GFX9-O0-NEXT: v_mov_b32_e32 v29, v45 ; GFX9-O0-NEXT: v_mov_b32_e32 v30, v44 ; GFX9-O0-NEXT: ; kill: def $vgpr31 killed $vgpr43 killed $exec -; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:120 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload -; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v31, off, s[0:3], s32 offset:116 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v30, off, s[0:3], s32 offset:112 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v29, off, s[0:3], s32 offset:108 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v28, off, s[0:3], s32 offset:104 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v27, off, s[0:3], s32 offset:100 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v26, off, s[0:3], s32 offset:96 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v11, off, s[0:3], s32 offset:92 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v10, off, s[0:3], s32 offset:88 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v9, off, s[0:3], s32 offset:84 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v8, off, s[0:3], s32 offset:80 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v7, off, s[0:3], s32 offset:76 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v6, off, s[0:3], s32 offset:72 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v5, off, s[0:3], s32 offset:68 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v4, off, s[0:3], s32 offset:64 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v2, off, s[0:3], s32 offset:60 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v1, off, s[0:3], s32 offset:56 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v0, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; implicit-def: $sgpr34 ; GFX9-O0-NEXT: ; kill: def $vgpr2 killed $vgpr2 def $vgpr2_vgpr3 killed $exec @@ -1256,7 +1227,7 @@ ; GFX9-O0-NEXT: buffer_load_dword v40, off, s[0:3], s32 offset:236 ; 4-byte Folded Reload ; GFX9-O0-NEXT: buffer_load_dword v41, off, s[0:3], s32 offset:240 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, -1 -; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:52 ; 4-byte Folded Reload +; GFX9-O0-NEXT: buffer_load_dword v42, off, s[0:3], s32 offset:200 ; 4-byte Folded Reload ; GFX9-O0-NEXT: s_mov_b64 exec, s[34:35] ; GFX9-O0-NEXT: s_waitcnt vmcnt(0) ; GFX9-O0-NEXT: s_setpc_b64 s[30:31] diff --git a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll --- a/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll +++ b/llvm/test/CodeGen/AMDGPU/wwm-reserved.ll @@ -125,8 +125,7 @@ ; GFX9-O0: v_add_u32_e64 v3, v3, v6 %tmp136 = add i32 %tmp134, %tmp107 %tmp137 = tail call i32 @llvm.amdgcn.wwm.i32(i32 %tmp136) -; GFX9-O0: buffer_store_dword v1 -; GFX9-O3: buffer_store_dword v0 +; GFX9: buffer_store_dword v0 call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %tmp137, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0) ret void } @@ -321,8 +320,7 @@ ; GFX9-O0: v_add_u32_e64 v3, v3, v6 %tmp136 = add i32 %tmp134, %tmp107 %tmp137 = tail call i32 @llvm.amdgcn.strict.wwm.i32(i32 %tmp136) -; GFX9-O0: buffer_store_dword v1 -; GFX9-O3: buffer_store_dword v0 +; GFX9: buffer_store_dword v0 call void @llvm.amdgcn.raw.ptr.buffer.store.i32(i32 %tmp137, ptr addrspace(8) %tmp14, i32 4, i32 0, i32 0) ret void } @@ -408,4 +406,4 @@ declare void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float>, ptr addrspace(8), i32, i32, i32) declare void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float>, ptr addrspace(8), i32, i32, i32) declare <2 x i32> @llvm.amdgcn.s.buffer.load.v2i32(<4 x i32>, i32, i32) -declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) \ No newline at end of file +declare <4 x i32> @llvm.amdgcn.s.buffer.load.v4i32(<4 x i32>, i32, i32) diff --git a/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir b/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir --- a/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/stack-id-assert.mir @@ -3,7 +3,7 @@ # contains not dead objects only. So using objects IDs as offset in the storage # caused out of bounds access. -# RUN: llc -march=amdgcn -start-before=si-lower-sgpr-spills -stop-after=prologepilog -verify-machineinstrs -o - %s | FileCheck %s +# RUN: llc -march=amdgcn -run-pass=si-lower-sgpr-spills,prologepilog -verify-machineinstrs -o - %s | FileCheck %s # CHECK-LABEL: name: foo # CHECK: {{^}}fixedStack: [] diff --git a/llvm/test/CodeGen/Mips/madd-msub.ll b/llvm/test/CodeGen/Mips/madd-msub.ll --- a/llvm/test/CodeGen/Mips/madd-msub.ll +++ b/llvm/test/CodeGen/Mips/madd-msub.ll @@ -42,22 +42,22 @@ ; ; 64-LABEL: madd1: ; 64: # %bb.0: # %entry -; 64-NEXT: sll $4, $4, 0 -; 64-NEXT: sll $5, $5, 0 -; 64-NEXT: dmult $5, $4 +; 64-NEXT: sll $1, $4, 0 +; 64-NEXT: sll $2, $5, 0 +; 64-NEXT: dmult $2, $1 ; 64-NEXT: mflo $1 -; 64-NEXT: sll $6, $6, 0 +; 64-NEXT: sll $2, $6, 0 ; 64-NEXT: jr $ra -; 64-NEXT: daddu $2, $1, $6 +; 64-NEXT: daddu $2, $1, $2 ; ; 64R6-LABEL: madd1: ; 64R6: # %bb.0: # %entry -; 64R6-NEXT: sll $4, $4, 0 -; 64R6-NEXT: sll $5, $5, 0 -; 64R6-NEXT: dmul $1, $5, $4 -; 64R6-NEXT: sll $6, $6, 0 +; 64R6-NEXT: sll $1, $4, 0 +; 64R6-NEXT: sll $2, $5, 0 +; 64R6-NEXT: dmul $1, $2, $1 +; 64R6-NEXT: sll $2, $6, 0 ; 64R6-NEXT: jr $ra -; 64R6-NEXT: daddu $2, $1, $6 +; 64R6-NEXT: daddu $2, $1, $2 ; ; 16-LABEL: madd1: ; 16: # %bb.0: # %entry @@ -173,18 +173,18 @@ ; ; 64-LABEL: madd3: ; 64: # %bb.0: # %entry -; 64-NEXT: sll $4, $4, 0 -; 64-NEXT: sll $5, $5, 0 -; 64-NEXT: dmult $5, $4 +; 64-NEXT: sll $1, $4, 0 +; 64-NEXT: sll $2, $5, 0 +; 64-NEXT: dmult $2, $1 ; 64-NEXT: mflo $1 ; 64-NEXT: jr $ra ; 64-NEXT: daddu $2, $1, $6 ; ; 64R6-LABEL: madd3: ; 64R6: # %bb.0: # %entry -; 64R6-NEXT: sll $4, $4, 0 -; 64R6-NEXT: sll $5, $5, 0 -; 64R6-NEXT: dmul $1, $5, $4 +; 64R6-NEXT: sll $1, $4, 0 +; 64R6-NEXT: sll $2, $5, 0 +; 64R6-NEXT: dmul $1, $2, $1 ; 64R6-NEXT: jr $ra ; 64R6-NEXT: daddu $2, $1, $6 ; @@ -291,22 +291,22 @@ ; ; 64-LABEL: msub1: ; 64: # %bb.0: # %entry -; 64-NEXT: sll $4, $4, 0 -; 64-NEXT: sll $5, $5, 0 -; 64-NEXT: dmult $5, $4 +; 64-NEXT: sll $1, $4, 0 +; 64-NEXT: sll $2, $5, 0 +; 64-NEXT: dmult $2, $1 ; 64-NEXT: mflo $1 -; 64-NEXT: sll $6, $6, 0 +; 64-NEXT: sll $2, $6, 0 ; 64-NEXT: jr $ra -; 64-NEXT: dsubu $2, $6, $1 +; 64-NEXT: dsubu $2, $2, $1 ; ; 64R6-LABEL: msub1: ; 64R6: # %bb.0: # %entry -; 64R6-NEXT: sll $4, $4, 0 -; 64R6-NEXT: sll $5, $5, 0 -; 64R6-NEXT: dmul $1, $5, $4 -; 64R6-NEXT: sll $6, $6, 0 +; 64R6-NEXT: sll $1, $4, 0 +; 64R6-NEXT: sll $2, $5, 0 +; 64R6-NEXT: dmul $1, $2, $1 +; 64R6-NEXT: sll $2, $6, 0 ; 64R6-NEXT: jr $ra -; 64R6-NEXT: dsubu $2, $6, $1 +; 64R6-NEXT: dsubu $2, $2, $1 ; ; 16-LABEL: msub1: ; 16: # %bb.0: # %entry @@ -424,18 +424,18 @@ ; ; 64-LABEL: msub3: ; 64: # %bb.0: # %entry -; 64-NEXT: sll $4, $4, 0 -; 64-NEXT: sll $5, $5, 0 -; 64-NEXT: dmult $5, $4 +; 64-NEXT: sll $1, $4, 0 +; 64-NEXT: sll $2, $5, 0 +; 64-NEXT: dmult $2, $1 ; 64-NEXT: mflo $1 ; 64-NEXT: jr $ra ; 64-NEXT: dsubu $2, $6, $1 ; ; 64R6-LABEL: msub3: ; 64R6: # %bb.0: # %entry -; 64R6-NEXT: sll $4, $4, 0 -; 64R6-NEXT: sll $5, $5, 0 -; 64R6-NEXT: dmul $1, $5, $4 +; 64R6-NEXT: sll $1, $4, 0 +; 64R6-NEXT: sll $2, $5, 0 +; 64R6-NEXT: dmul $1, $2, $1 ; 64R6-NEXT: jr $ra ; 64R6-NEXT: dsubu $2, $6, $1 ; @@ -546,22 +546,22 @@ ; ; 64-LABEL: msub5: ; 64: # %bb.0: # %entry -; 64-NEXT: sll $4, $4, 0 -; 64-NEXT: sll $5, $5, 0 -; 64-NEXT: dmult $5, $4 +; 64-NEXT: sll $1, $4, 0 +; 64-NEXT: sll $2, $5, 0 +; 64-NEXT: dmult $2, $1 ; 64-NEXT: mflo $1 -; 64-NEXT: sll $6, $6, 0 +; 64-NEXT: sll $2, $6, 0 ; 64-NEXT: jr $ra -; 64-NEXT: dsubu $2, $1, $6 +; 64-NEXT: dsubu $2, $1, $2 ; ; 64R6-LABEL: msub5: ; 64R6: # %bb.0: # %entry -; 64R6-NEXT: sll $4, $4, 0 -; 64R6-NEXT: sll $5, $5, 0 -; 64R6-NEXT: dmul $1, $5, $4 -; 64R6-NEXT: sll $6, $6, 0 +; 64R6-NEXT: sll $1, $4, 0 +; 64R6-NEXT: sll $2, $5, 0 +; 64R6-NEXT: dmul $1, $2, $1 +; 64R6-NEXT: sll $2, $6, 0 ; 64R6-NEXT: jr $ra -; 64R6-NEXT: dsubu $2, $1, $6 +; 64R6-NEXT: dsubu $2, $1, $2 ; ; 16-LABEL: msub5: ; 16: # %bb.0: # %entry diff --git a/llvm/test/CodeGen/X86/GlobalISel/add-ext.ll b/llvm/test/CodeGen/X86/GlobalISel/add-ext.ll --- a/llvm/test/CodeGen/X86/GlobalISel/add-ext.ll +++ b/llvm/test/CodeGen/X86/GlobalISel/add-ext.ll @@ -205,8 +205,8 @@ ; CHECK-NEXT: addq %rdi, %rcx ; CHECK-NEXT: movl (%rcx), %ecx ; CHECK-NEXT: addl (%rax), %ecx -; CHECK-NEXT: movl %esi, %esi -; CHECK-NEXT: imulq $4, %rsi, %rax +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: imulq $4, %rax, %rax ; CHECK-NEXT: addq %rdi, %rax ; CHECK-NEXT: movl %ecx, (%rax) ; CHECK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/dagcombine-cse.ll b/llvm/test/CodeGen/X86/dagcombine-cse.ll --- a/llvm/test/CodeGen/X86/dagcombine-cse.ll +++ b/llvm/test/CodeGen/X86/dagcombine-cse.ll @@ -106,24 +106,24 @@ ; ; X64-LABEL: square_high: ; X64: ## %bb.0: ## %entry -; X64-NEXT: movl %esi, %esi -; X64-NEXT: movq %rsi, %rax +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movq %rcx, %rax ; X64-NEXT: mulq %rdi -; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rdx, %rsi ; X64-NEXT: movq %rax, %r8 ; X64-NEXT: movq %rdi, %rax ; X64-NEXT: mulq %rdi ; X64-NEXT: addq %r8, %rdx -; X64-NEXT: movq %rcx, %rax +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: adcq $0, %rax ; X64-NEXT: addq %rdx, %r8 -; X64-NEXT: adcq %rcx, %rax -; X64-NEXT: imulq %rsi, %rsi -; X64-NEXT: addq %rax, %rsi -; X64-NEXT: shrdq $32, %rsi, %r8 -; X64-NEXT: shrq $32, %rsi +; X64-NEXT: adcq %rsi, %rax +; X64-NEXT: imulq %rcx, %rcx +; X64-NEXT: addq %rax, %rcx +; X64-NEXT: shrdq $32, %rcx, %r8 +; X64-NEXT: shrq $32, %rcx ; X64-NEXT: movq %r8, %rax -; X64-NEXT: movq %rsi, %rdx +; X64-NEXT: movq %rcx, %rdx ; X64-NEXT: retq entry: %conv = zext i96 %x to i192 diff --git a/llvm/test/CodeGen/X86/fold-and-shift-x86_64.ll b/llvm/test/CodeGen/X86/fold-and-shift-x86_64.ll --- a/llvm/test/CodeGen/X86/fold-and-shift-x86_64.ll +++ b/llvm/test/CodeGen/X86/fold-and-shift-x86_64.ll @@ -34,8 +34,8 @@ define i8 @t3(ptr %X, i64 %i) { ; CHECK-LABEL: t3: ; CHECK: # %bb.0: # %entry -; CHECK-NEXT: movl %esi, %esi -; CHECK-NEXT: movzbl (%rdi,%rsi,4), %eax +; CHECK-NEXT: movl %esi, %eax +; CHECK-NEXT: movzbl (%rdi,%rax,4), %eax ; CHECK-NEXT: retq entry: diff --git a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll --- a/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll +++ b/llvm/test/CodeGen/X86/unfold-masked-merge-scalar-constmask-lowhigh.ll @@ -78,18 +78,18 @@ define i64 @out64_constmask(i64 %x, i64 %y) { ; CHECK-NOBMI-LABEL: out64_constmask: ; CHECK-NOBMI: # %bb.0: -; CHECK-NOBMI-NEXT: movl %edi, %edi +; CHECK-NOBMI-NEXT: movl %edi, %ecx ; CHECK-NOBMI-NEXT: movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000 ; CHECK-NOBMI-NEXT: andq %rsi, %rax -; CHECK-NOBMI-NEXT: orq %rdi, %rax +; CHECK-NOBMI-NEXT: orq %rcx, %rax ; CHECK-NOBMI-NEXT: retq ; ; CHECK-BMI-LABEL: out64_constmask: ; CHECK-BMI: # %bb.0: -; CHECK-BMI-NEXT: movl %edi, %edi +; CHECK-BMI-NEXT: movl %edi, %ecx ; CHECK-BMI-NEXT: movabsq $-4294967296, %rax # imm = 0xFFFFFFFF00000000 ; CHECK-BMI-NEXT: andq %rsi, %rax -; CHECK-BMI-NEXT: orq %rdi, %rax +; CHECK-BMI-NEXT: orq %rcx, %rax ; CHECK-BMI-NEXT: retq %mx = and i64 %x, 4294967295 %my = and i64 %y, -4294967296